#################### Exploratory Data Analysis ######################
## Dataset: Station_GeoLocation_Longitute_Latitude_Elevation_EPSG_4326.csv
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Station)
## [1] 8 4
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Station)
## Rows: 8
## Columns: 4
## $ longitude <dbl> 73.0167, 80.2500, 77.2000, 80.9330, 72.8500, 77.5833, 85…
## $ Latitude <dbl> 26.3000, 13.0667, 28.5833, 26.8667, 19.1167, 12.9667, 20…
## $ Elevation <int> 217, 6, 211, 110, 8, 920, NA, NA
## $ Location_Name <chr> "Bangalore", "Chennai", "Delhi", "Lucknow", "Mumbai", "R…
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(Weather_Station)
## [1] "longitude" "Latitude" "Elevation" "Location_Name"
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Station)
## 'data.frame': 8 obs. of 4 variables:
## $ longitude : num 73 80.2 77.2 80.9 72.8 ...
## $ Latitude : num 26.3 13.1 28.6 26.9 19.1 ...
## $ Elevation : int 217 6 211 110 8 920 NA NA
## $ Location_Name: chr "Bangalore" "Chennai" "Delhi" "Lucknow" ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Station)
## longitude Latitude Elevation Location_Name
## Min. :72.85 Min. :12.97 Min. : 6.0 Length:8
## 1st Qu.:76.15 1st Qu.:17.60 1st Qu.: 33.5 Class :character
## Median :78.92 Median :21.23 Median :160.5 Mode :character
## Mean :79.07 Mean :21.17 Mean :245.3
## 3rd Qu.:81.92 3rd Qu.:26.44 3rd Qu.:215.5
## Max. :85.83 Max. :28.58 Max. :920.0
## NA's :2
## Only Elevation seems to have some missing data, lets zoom into them.
## Reference the column explicitly instead of using attach(): attach() copies
## the data frame onto the search path, which masks names and goes stale if
## the data frame is later modified.
Weather_Station[is.na(Weather_Station$Elevation), ]
## longitude Latitude Elevation Location_Name
## 7 85.8333 20.2500 NA Bubhneshwar
## 8 84.8833 22.2167 NA Rourkela
## Nothing special about why Bubhneshwar and Rourkela alone seem to have elevation missing
## No cleaning needed as there are no plans to make use of the elevation data of the stations
## To find outliers, draw a histogram
## Dataset: Bangalore_1990_2022_BangaloreCity.csv
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Bangalore)
## [1] 11894 5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Bangalore)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 22.9, 21.7, 21.0, 20.8, 20.4, 20.4, 18.8, 20.0, 21.0, 21.2, 21.8,…
## $ tmin <dbl> 19.1, NA, 16.4, NA, 14.2, 17.1, NA, 16.6, 15.5, 15.0, 16.0, 13.2,…
## $ tmax <dbl> 28.4, 26.5, 26.5, 27.4, 26.1, 24.2, 20.5, 25.1, NA, 27.7, 28.5, N…
## $ prcp <dbl> NA, 0, 0, 0, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Bangalore)
## time tavg tmin tmax prcp
## 1 01-01-1990 22.9 19.1 28.4 NA
## 2 02-01-1990 21.7 NA 26.5 0
## 3 03-01-1990 21.0 16.4 26.5 0
## 4 04-01-1990 20.8 NA 27.4 0
## 5 05-01-1990 20.4 14.2 26.1 0
## 6 06-01-1990 20.4 17.1 24.2 NA
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Bangalore)
## time tavg tmin tmax prcp
## 11889 20-07-2022 24.9 19.8 30.8 0.0
## 11890 21-07-2022 23.7 20.5 30.8 82.5
## 11891 22-07-2022 23.2 21.1 27.9 0.0
## 11892 23-07-2022 23.1 20.9 26.7 0.0
## 11893 24-07-2022 22.8 20.0 26.7 0.3
## 11894 25-07-2022 24.1 20.2 28.5 0.5
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
all_columns <- names(Weather_Bangalore)
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Bangalore)
## 'data.frame': 11894 obs. of 5 variables:
## $ time: chr "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
## $ tavg: num 22.9 21.7 21 20.8 20.4 20.4 18.8 20 21 21.2 ...
## $ tmin: num 19.1 NA 16.4 NA 14.2 17.1 NA 16.6 15.5 15 ...
## $ tmax: num 28.4 26.5 26.5 27.4 26.1 24.2 20.5 25.1 NA 27.7 ...
## $ prcp: num NA 0 0 0 0 NA NA 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Bangalore)
## time tavg tmin tmax
## Length:11894 Min. :17.20 Min. : 9.30 Min. :19.80
## Class :character 1st Qu.:22.30 1st Qu.:18.10 1st Qu.:27.90
## Mode :character Median :23.50 Median :19.80 Median :29.50
## Mean :23.84 Mean :19.39 Mean :29.93
## 3rd Qu.:25.20 3rd Qu.:20.80 3rd Qu.:32.00
## Max. :32.40 Max. :27.90 Max. :39.20
## NA's :70 NA's :1389 NA's :629
## prcp
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 4.414
## 3rd Qu.: 2.000
## Max. :271.300
## NA's :4620
## Lets see the table with values for missing time
sum(is.na(Weather_Bangalore))
## [1] 6708
## Ok there are about 6708 NAs
## Dataset: Chennai_1990_2022_Madras.csv
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Chennai)
## [1] 11894 5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Chennai)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 25.2, 24.9, 25.6, 25.7, 25.5, 24.7, 25.4, 25.6, 24.8, 24.7, 24.5,…
## $ tmin <dbl> 22.8, 21.7, 21.4, NA, 20.7, NA, 23.3, 22.0, 21.7, 20.7, 20.0, 18.…
## $ tmax <dbl> 28.4, 29.1, 29.8, 28.7, 28.4, 26.1, 27.0, 28.0, 28.5, 29.0, 28.8,…
## $ prcp <dbl> 0.5, 0.0, 0.0, 0.0, 0.0, 0.5, 18.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Chennai)
## time tavg tmin tmax prcp
## 1 01-01-1990 25.2 22.8 28.4 0.5
## 2 02-01-1990 24.9 21.7 29.1 0.0
## 3 03-01-1990 25.6 21.4 29.8 0.0
## 4 04-01-1990 25.7 NA 28.7 0.0
## 5 05-01-1990 25.5 20.7 28.4 0.0
## 6 06-01-1990 24.7 NA 26.1 0.5
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Chennai)
## time tavg tmin tmax prcp
## 11889 20-07-2022 28.9 26.2 33.0 9.3
## 11890 21-07-2022 28.4 24.5 32.8 21.1
## 11891 22-07-2022 27.8 24.6 32.2 22.1
## 11892 23-07-2022 27.4 24.7 32.6 18.6
## 11893 24-07-2022 27.8 25.0 33.3 9.1
## 11894 25-07-2022 28.1 25.4 32.6 2.9
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
all_columns <- names(Weather_Chennai)
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Chennai)
## 'data.frame': 11894 obs. of 5 variables:
## $ time: chr "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
## $ tavg: num 25.2 24.9 25.6 25.7 25.5 24.7 25.4 25.6 24.8 24.7 ...
## $ tmin: num 22.8 21.7 21.4 NA 20.7 NA 23.3 22 21.7 20.7 ...
## $ tmax: num 28.4 29.1 29.8 28.7 28.4 26.1 27 28 28.5 29 ...
## $ prcp: num 0.5 0 0 0 0 0.5 18 0.5 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Chennai)
## time tavg tmin tmax
## Length:11894 Min. :20.90 Min. :12.00 Min. :23.80
## Class :character 1st Qu.:26.30 1st Qu.:22.60 1st Qu.:31.10
## Mode :character Median :28.70 Median :24.60 Median :34.00
## Mean :28.49 Mean :24.38 Mean :33.91
## 3rd Qu.:30.40 3rd Qu.:26.40 3rd Qu.:36.20
## Max. :36.60 Max. :31.00 Max. :44.60
## NA's :27 NA's :3084 NA's :1019
## prcp
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 6.244
## 3rd Qu.: 3.000
## Max. :344.900
## NA's :4886
sum(is.na(Weather_Chennai))
## [1] 9016
## About 9016 entries are NA
## Dataset: Delhi_NCR_1990_2022_Safdarjung.csv
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Delhi)
## [1] 11894 5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Delhi)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 9.4, 9.3, 9.0, 10.7, 12.6, 14.9, 14.4, 10.7, 13.4, 16.6, 17.0, 17…
## $ tmin <dbl> 6.0, 5.2, 6.5, 6.0, 7.3, 8.1, 8.1, 8.5, 7.0, NA, 10.9, 9.8, 8.8, …
## $ tmax <dbl> 15.1, 14.2, 13.6, 17.5, 20.8, 22.9, 21.4, 16.6, 20.6, 22.8, 25.3,…
## $ prcp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Delhi)
## time tavg tmin tmax prcp
## 1 01-01-1990 9.4 6.0 15.1 0
## 2 02-01-1990 9.3 5.2 14.2 0
## 3 03-01-1990 9.0 6.5 13.6 0
## 4 04-01-1990 10.7 6.0 17.5 0
## 5 05-01-1990 12.6 7.3 20.8 0
## 6 06-01-1990 14.9 8.1 22.9 0
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Delhi)
## time tavg tmin tmax prcp
## 11889 20-07-2022 30.1 26.5 33.2 14.7
## 11890 21-07-2022 28.6 26.8 30.6 21.2
## 11891 22-07-2022 29.3 27.0 32.9 0.3
## 11892 23-07-2022 30.1 25.5 34.9 8.9
## 11893 24-07-2022 30.6 27.1 35.7 0.0
## 11894 25-07-2022 30.7 26.8 35.7 0.0
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
all_columns <- names(Weather_Delhi)
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Delhi)
## 'data.frame': 11894 obs. of 5 variables:
## $ time: chr "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
## $ tavg: num 9.4 9.3 9 10.7 12.6 14.9 14.4 10.7 13.4 16.6 ...
## $ tmin: num 6 5.2 6.5 6 7.3 8.1 8.1 8.5 7 NA ...
## $ tmax: num 15.1 14.2 13.6 17.5 20.8 22.9 21.4 16.6 20.6 22.8 ...
## $ prcp: num 0 0 0 0 0 0 0 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Delhi)
## time tavg tmin tmax
## Length:11894 Min. : 6.6 Min. : 0.10 Min. : 9.80
## Class :character 1st Qu.:18.5 1st Qu.:11.80 1st Qu.:26.70
## Mode :character Median :27.0 Median :20.00 Median :33.20
## Mean :25.0 Mean :18.88 Mean :31.79
## 3rd Qu.:30.9 3rd Qu.:26.00 3rd Qu.:36.60
## Max. :39.8 Max. :34.20 Max. :48.10
## NA's :94 NA's :1536 NA's :533
## prcp
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 3.662
## 3rd Qu.: 0.500
## Max. :262.900
## NA's :6140
sum(is.na(Weather_Delhi))
## [1] 8303
## About 8303 entries are NA
## Dataset: Lucknow_1990_2022.csv
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Lucknow)
## [1] 11894 5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Lucknow)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 7.2, 10.5, 10.2, 9.1, 13.5, 11.5, 14.2, 17.1, 11.1, 14.8, 12.9, 1…
## $ tmin <dbl> NA, NA, 1.8, NA, NA, 5.9, 5.4, NA, NA, 4.1, 5.1, 7.3, NA, 6.9, 9.…
## $ tmax <dbl> 18.1, 17.2, 18.6, 19.3, 23.8, 21.4, 23.6, 24.6, 24.6, 23.6, 23.6,…
## $ prcp <dbl> 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, …
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Lucknow)
## time tavg tmin tmax prcp
## 1 01-01-1990 7.2 NA 18.1 0
## 2 02-01-1990 10.5 NA 17.2 0
## 3 03-01-1990 10.2 1.8 18.6 NA
## 4 04-01-1990 9.1 NA 19.3 0
## 5 05-01-1990 13.5 NA 23.8 0
## 6 06-01-1990 11.5 5.9 21.4 0
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Lucknow)
## time tavg tmin tmax prcp
## 11889 20-07-2022 28.6 25.1 33.1 17.7
## 11890 21-07-2022 27.4 25.1 33.1 27.3
## 11891 22-07-2022 28.1 26.1 31.1 16.0
## 11892 23-07-2022 30.3 26.2 34.7 11.9
## 11893 24-07-2022 30.0 28.1 34.7 2.0
## 11894 25-07-2022 27.1 24.1 34.3 0.5
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
all_columns <- names(Weather_Lucknow)
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Lucknow)
## 'data.frame': 11894 obs. of 5 variables:
## $ time: chr "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
## $ tavg: num 7.2 10.5 10.2 9.1 13.5 11.5 14.2 17.1 11.1 14.8 ...
## $ tmin: num NA NA 1.8 NA NA 5.9 5.4 NA NA 4.1 ...
## $ tmax: num 18.1 17.2 18.6 19.3 23.8 21.4 23.6 24.6 24.6 23.6 ...
## $ prcp: num 0 0 NA 0 0 0 0 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Lucknow)
## time tavg tmin tmax
## Length:11894 Min. : 5.70 Min. :-0.6 Min. :11.10
## Class :character 1st Qu.:19.50 1st Qu.:12.5 1st Qu.:28.10
## Mode :character Median :27.20 Median :20.5 Median :33.40
## Mean :25.22 Mean :18.8 Mean :32.49
## 3rd Qu.:30.40 3rd Qu.:25.1 3rd Qu.:36.50
## Max. :39.70 Max. :32.7 Max. :47.30
## NA's :138 NA's :3515 NA's :1553
## prcp
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 4.536
## 3rd Qu.: 1.000
## Max. :470.900
## NA's :6152
sum(is.na(Weather_Lucknow))
## [1] 11358
## About 11358 entries are NA
## Dataset: Mumbai_1990_2022_Santacruz.csv
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Mumbai)
## [1] 11894 5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Mumbai)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 23.2, 22.2, 21.8, 25.4, 26.5, 25.1, 26.0, 26.6, 25.1, 26.8, 25.6,…
## $ tmin <dbl> 17.0, 16.5, 16.3, 17.9, 19.3, 19.8, 18.9, 18.8, 19.0, 19.3, 18.5,…
## $ tmax <dbl> NA, 29.9, 30.7, 31.8, 33.7, 33.5, 33.7, 34.6, 34.4, 34.7, 34.0, 3…
## $ prcp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Mumbai)
## time tavg tmin tmax prcp
## 1 01-01-1990 23.2 17.0 NA 0
## 2 02-01-1990 22.2 16.5 29.9 0
## 3 03-01-1990 21.8 16.3 30.7 0
## 4 04-01-1990 25.4 17.9 31.8 0
## 5 05-01-1990 26.5 19.3 33.7 0
## 6 06-01-1990 25.1 19.8 33.5 0
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Mumbai)
## time tavg tmin tmax prcp
## 11889 20-07-2022 27.4 25.0 30.5 11.9
## 11890 21-07-2022 27.6 25.6 30.5 10.9
## 11891 22-07-2022 28.3 26.0 30.5 3.0
## 11892 23-07-2022 28.2 25.8 31.3 5.1
## 11893 24-07-2022 28.1 25.6 30.4 7.1
## 11894 25-07-2022 28.3 25.1 30.2 7.1
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
all_columns <- names(Weather_Mumbai)
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Mumbai)
## 'data.frame': 11894 obs. of 5 variables:
## $ time: chr "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
## $ tavg: num 23.2 22.2 21.8 25.4 26.5 25.1 26 26.6 25.1 26.8 ...
## $ tmin: num 17 16.5 16.3 17.9 19.3 19.8 18.9 18.8 19 19.3 ...
## $ tmax: num NA 29.9 30.7 31.8 33.7 33.5 33.7 34.6 34.4 34.7 ...
## $ prcp: num 0 0 0 0 0 0 0 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Mumbai)
## time tavg tmin tmax
## Length:11894 Min. :17.70 Min. : 8.50 Min. :22.30
## Class :character 1st Qu.:26.60 1st Qu.:19.80 1st Qu.:30.90
## Mode :character Median :28.10 Median :23.70 Median :32.40
## Mean :27.76 Mean :22.62 Mean :32.31
## 3rd Qu.:29.30 3rd Qu.:25.40 3rd Qu.:33.90
## Max. :33.70 Max. :30.40 Max. :41.30
## NA's :11 NA's :2454 NA's :1907
## prcp
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 10.94
## 3rd Qu.: 7.10
## Max. :461.00
## NA's :4681
sum(is.na(Weather_Mumbai))
## [1] 9053
## About 9053 entries are NA
## Dataset: Rajasthan_1990_2022_Jodhpur.csv
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Jodhpur)
## [1] 11894 5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Jodhpur)
## Rows: 11,894
## Columns: 5
## $ time <chr> "01-01-1990", "02-01-1990", "03-01-1990", "04-01-1990", "05-01-19…
## $ tavg <dbl> 22.9, 21.7, 21.0, 20.8, 20.4, 20.4, 18.8, 20.0, 21.0, 21.2, 21.8,…
## $ tmin <dbl> 19.1, NA, 16.4, NA, 14.2, 17.1, NA, 16.6, 15.5, 15.0, 16.0, 13.2,…
## $ tmax <dbl> 28.4, 26.5, 26.5, 27.4, 26.1, 24.2, 20.5, 25.1, NA, 27.7, 28.5, N…
## $ prcp <dbl> NA, 0, 0, 0, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Jodhpur)
## time tavg tmin tmax prcp
## 1 01-01-1990 22.9 19.1 28.4 NA
## 2 02-01-1990 21.7 NA 26.5 0
## 3 03-01-1990 21.0 16.4 26.5 0
## 4 04-01-1990 20.8 NA 27.4 0
## 5 05-01-1990 20.4 14.2 26.1 0
## 6 06-01-1990 20.4 17.1 24.2 NA
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Jodhpur)
## time tavg tmin tmax prcp
## 11889 20-07-2022 24.9 19.8 30.8 0.0
## 11890 21-07-2022 23.7 20.5 30.8 82.5
## 11891 22-07-2022 23.2 21.1 27.9 0.0
## 11892 23-07-2022 23.1 20.9 26.7 0.0
## 11893 24-07-2022 22.8 20.0 26.7 0.3
## 11894 25-07-2022 24.1 20.2 28.5 0.5
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
all_columns <- names(Weather_Jodhpur)
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Jodhpur)
## 'data.frame': 11894 obs. of 5 variables:
## $ time: chr "01-01-1990" "02-01-1990" "03-01-1990" "04-01-1990" ...
## $ tavg: num 22.9 21.7 21 20.8 20.4 20.4 18.8 20 21 21.2 ...
## $ tmin: num 19.1 NA 16.4 NA 14.2 17.1 NA 16.6 15.5 15 ...
## $ tmax: num 28.4 26.5 26.5 27.4 26.1 24.2 20.5 25.1 NA 27.7 ...
## $ prcp: num NA 0 0 0 0 NA NA 0 0 0 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Jodhpur)
## time tavg tmin tmax
## Length:11894 Min. :17.20 Min. : 9.30 Min. :19.80
## Class :character 1st Qu.:22.30 1st Qu.:18.10 1st Qu.:27.90
## Mode :character Median :23.50 Median :19.80 Median :29.50
## Mean :23.84 Mean :19.39 Mean :29.93
## 3rd Qu.:25.20 3rd Qu.:20.80 3rd Qu.:32.00
## Max. :32.40 Max. :27.90 Max. :39.20
## NA's :70 NA's :1389 NA's :629
## prcp
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 4.414
## 3rd Qu.: 2.000
## Max. :271.300
## NA's :4620
sum(is.na(Weather_Jodhpur))
## [1] 6708
## About 6708 entries are NA
## Dataset: weather_Bhubhneshwar_1990_2022.csv
## Have a look at the data
#definitely has more columns than the cities that we have seen so far
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Bhubhneshwar)
## [1] 11935 11
#OK, so we have 11 columns, 6 more than others
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Bhubhneshwar)
## Rows: 11,935
## Columns: 11
## $ time <chr> "1990-01-01", "1990-01-02", "1990-01-03", "1990-01-04", "1990-01-…
## $ tavg <dbl> 20.1, 20.7, 20.7, 18.8, 19.8, 22.2, 20.8, 20.3, 22.3, 21.6, 21.7,…
## $ tmin <dbl> NA, 16.4, 16.0, NA, 11.0, 12.5, NA, 13.6, 14.8, 14.5, 15.6, 12.8,…
## $ tmax <dbl> 28.0, NA, 27.4, 28.0, 28.2, NA, NA, 29.5, 31.6, 30.8, 30.7, 29.3,…
## $ prcp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, NA, 0, 0, 0, …
## $ snow <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ wdir <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ wspd <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ wpgt <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ pres <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ tsun <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Bhubhneshwar)
## time tavg tmin tmax prcp snow wdir wspd wpgt pres tsun
## 1 1990-01-01 20.1 NA 28.0 0 NA NA NA NA NA NA
## 2 1990-01-02 20.7 16.4 NA 0 NA NA NA NA NA NA
## 3 1990-01-03 20.7 16.0 27.4 0 NA NA NA NA NA NA
## 4 1990-01-04 18.8 NA 28.0 0 NA NA NA NA NA NA
## 5 1990-01-05 19.8 11.0 28.2 0 NA NA NA NA NA NA
## 6 1990-01-06 22.2 12.5 NA 0 NA NA NA NA NA NA
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Bhubhneshwar)
## time tavg tmin tmax prcp snow wdir wspd wpgt pres tsun
## 11930 2022-08-30 30.0 27.0 34.0 1.2 NA 169 8.3 NA 1007.6 NA
## 11931 2022-08-31 29.2 26.3 33.0 9.0 NA 186 8.2 NA 1006.6 NA
## 11932 2022-09-01 29.6 27.0 33.0 2.1 NA 190 9.5 NA 1006.8 NA
## 11933 2022-09-02 29.7 26.3 33.0 3.3 NA 198 9.5 NA 1007.3 NA
## 11934 2022-09-03 29.2 26.1 34.0 9.7 NA 215 8.5 NA 1005.5 NA
## 11935 2022-09-04 27.6 25.9 31.6 12.8 NA 214 8.6 NA 1004.9 NA
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(Weather_Bhubhneshwar)
## [1] "time" "tavg" "tmin" "tmax" "prcp" "snow" "wdir" "wspd" "wpgt" "pres"
## [11] "tsun"
## So the additional columns are: snow depth (snow), wind direction (wdir), wind speed (wspd), peak wind gust (wpgt), sea-level pressure (pres) and sunshine duration (tsun)
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Bhubhneshwar)
## 'data.frame': 11935 obs. of 11 variables:
## $ time: chr "1990-01-01" "1990-01-02" "1990-01-03" "1990-01-04" ...
## $ tavg: num 20.1 20.7 20.7 18.8 19.8 22.2 20.8 20.3 22.3 21.6 ...
## $ tmin: num NA 16.4 16 NA 11 12.5 NA 13.6 14.8 14.5 ...
## $ tmax: num 28 NA 27.4 28 28.2 NA NA 29.5 31.6 30.8 ...
## $ prcp: num 0 0 0 0 0 0 0 0 0 0 ...
## $ snow: logi NA NA NA NA NA NA ...
## $ wdir: num NA NA NA NA NA NA NA NA NA NA ...
## $ wspd: num NA NA NA NA NA NA NA NA NA NA ...
## $ wpgt: logi NA NA NA NA NA NA ...
## $ pres: num NA NA NA NA NA NA NA NA NA NA ...
## $ tsun: logi NA NA NA NA NA NA ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Bhubhneshwar)
## time tavg tmin tmax
## Length:11935 Min. :15.70 Min. : 8.20 Min. :19.4
## Class :character 1st Qu.:24.70 1st Qu.:19.00 1st Qu.:30.4
## Mode :character Median :27.70 Median :24.00 Median :32.8
## Mean :26.99 Mean :22.24 Mean :33.0
## 3rd Qu.:29.40 3rd Qu.:25.60 3rd Qu.:35.4
## Max. :37.40 Max. :31.80 Max. :46.7
## NA's :78 NA's :2090 NA's :891
## prcp snow wdir wspd
## Min. : 0.000 Mode:logical Min. : 0.0 Min. : 0.500
## 1st Qu.: 0.000 NA's:11935 1st Qu.: 89.0 1st Qu.: 4.500
## Median : 0.000 Median :188.0 Median : 7.000
## Mean : 7.074 Mean :169.1 Mean : 8.399
## 3rd Qu.: 4.100 3rd Qu.:220.8 3rd Qu.:11.000
## Max. :470.900 Max. :359.0 Max. :33.100
## NA's :5097 NA's :10641 NA's :9806
## wpgt pres tsun
## Mode:logical Min. : 990.6 Mode:logical
## NA's:11935 1st Qu.:1002.9 NA's:11935
## Median :1007.3
## Mean :1007.4
## 3rd Qu.:1012.4
## Max. :1019.3
## NA's :10692
sum(is.na(Weather_Bhubhneshwar))
## [1] 75100
## About 75100 entries are NA
## Dataset: weather_Rourkela_2021_2022.csv
## Have a look at the data
#definitely has more columns than the cities that we have seen so far
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Weather_Rourkela)
## [1] 426 11
#OK, so we have 11 columns, 6 more than others
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Weather_Rourkela)
## Rows: 426
## Columns: 11
## $ time <chr> "2021-07-06", "2021-07-07", "2021-07-08", "2021-07-09", "2021-07-…
## $ tavg <dbl> 29.3, 29.7, 27.4, 28.5, 29.0, 29.3, 28.9, 28.6, 29.0, 29.5, 29.6,…
## $ tmin <dbl> 26.2, 27.3, 25.8, 26.1, 26.2, 26.2, 25.7, 25.5, 25.4, 25.5, 26.3,…
## $ tmax <dbl> 32.6, 33.4, 29.7, 32.1, 32.6, 33.7, 32.9, 32.5, 32.7, 33.4, 33.2,…
## $ prcp <dbl> NA, 11.1, 66.9, 11.4, 2.7, 10.8, 5.4, 10.1, 1.9, 1.3, 1.1, 6.0, 8…
## $ snow <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ wdir <dbl> 197, 199, 186, 173, 121, 70, 95, 101, 138, 152, 179, 181, 181, 19…
## $ wspd <dbl> 6.8, 6.9, 6.3, 3.9, 4.6, 5.8, 7.0, 5.5, 6.5, 8.7, 9.5, 8.3, 8.0, …
## $ wpgt <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ pres <dbl> 1002.5, 1002.2, 1001.8, 1001.0, 1000.9, 1002.2, 1003.4, 1002.8, 1…
## $ tsun <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
print("Lets see the head of the dataset")
## [1] "Lets see the head of the dataset"
head(Weather_Rourkela)
## time tavg tmin tmax prcp snow wdir wspd wpgt pres tsun
## 1 2021-07-06 29.3 26.2 32.6 NA NA 197 6.8 NA 1002.5 NA
## 2 2021-07-07 29.7 27.3 33.4 11.1 NA 199 6.9 NA 1002.2 NA
## 3 2021-07-08 27.4 25.8 29.7 66.9 NA 186 6.3 NA 1001.8 NA
## 4 2021-07-09 28.5 26.1 32.1 11.4 NA 173 3.9 NA 1001.0 NA
## 5 2021-07-10 29.0 26.2 32.6 2.7 NA 121 4.6 NA 1000.9 NA
## 6 2021-07-11 29.3 26.2 33.7 10.8 NA 70 5.8 NA 1002.2 NA
print("Lets see the tail of the dataset")
## [1] "Lets see the tail of the dataset"
tail(Weather_Rourkela)
## time tavg tmin tmax prcp snow wdir wspd wpgt pres tsun
## 421 2022-08-30 29.8 26.4 34.3 0.0 NA 174 7.6 NA 1007.9 NA
## 422 2022-08-31 29.0 26.6 33.5 2.0 NA 187 8.6 NA 1006.8 NA
## 423 2022-09-01 29.1 25.7 33.2 11.5 NA 205 6.7 NA 1007.2 NA
## 424 2022-09-02 29.4 26.4 33.7 1.5 NA 189 7.0 NA 1007.5 NA
## 425 2022-09-03 28.7 26.6 32.6 8.0 NA 203 8.0 NA 1005.8 NA
## 426 2022-09-04 28.2 25.9 31.8 17.7 NA 211 6.8 NA 1004.8 NA
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(Weather_Rourkela)
## [1] "time" "tavg" "tmin" "tmax" "prcp" "snow" "wdir" "wspd" "wpgt" "pres"
## [11] "tsun"
## So the additional columns are: snow depth (snow), wind direction (wdir), wind speed (wspd), peak wind gust (wpgt), sea-level pressure (pres) and sunshine duration (tsun)
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Weather_Rourkela)
## 'data.frame': 426 obs. of 11 variables:
## $ time: chr "2021-07-06" "2021-07-07" "2021-07-08" "2021-07-09" ...
## $ tavg: num 29.3 29.7 27.4 28.5 29 29.3 28.9 28.6 29 29.5 ...
## $ tmin: num 26.2 27.3 25.8 26.1 26.2 26.2 25.7 25.5 25.4 25.5 ...
## $ tmax: num 32.6 33.4 29.7 32.1 32.6 33.7 32.9 32.5 32.7 33.4 ...
## $ prcp: num NA 11.1 66.9 11.4 2.7 10.8 5.4 10.1 1.9 1.3 ...
## $ snow: logi NA NA NA NA NA NA ...
## $ wdir: num 197 199 186 173 121 70 95 101 138 152 ...
## $ wspd: num 6.8 6.9 6.3 3.9 4.6 5.8 7 5.5 6.5 8.7 ...
## $ wpgt: logi NA NA NA NA NA NA ...
## $ pres: num 1002 1002 1002 1001 1001 ...
## $ tsun: logi NA NA NA NA NA NA ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Weather_Rourkela)
## time tavg tmin tmax
## Length:426 Min. :14.60 Min. : 8.20 Min. :21.50
## Class :character 1st Qu.:24.40 1st Qu.:18.18 1st Qu.:29.60
## Mode :character Median :28.10 Median :25.20 Median :32.10
## Mean :26.71 Mean :22.30 Mean :32.25
## 3rd Qu.:29.30 3rd Qu.:26.10 3rd Qu.:33.80
## Max. :35.00 Max. :29.30 Max. :43.60
## NA's :2 NA's :2 NA's :2
## prcp snow wdir wspd
## Min. : 0.000 Mode:logical Min. : 0.0 Min. : 2.900
## 1st Qu.: 0.000 NA's:426 1st Qu.: 49.0 1st Qu.: 5.500
## Median : 0.200 Median :168.0 Median : 6.600
## Mean : 5.695 Mean :140.3 Mean : 7.441
## 3rd Qu.: 7.200 3rd Qu.:195.2 3rd Qu.: 8.725
## Max. :123.000 Max. :359.0 Max. :20.400
## NA's :3 NA's :2 NA's :2
## wpgt pres tsun
## Mode:logical Min. : 993.1 Mode:logical
## NA's:426 1st Qu.:1002.5 NA's:426
## Median :1005.5
## Mean :1006.8
## 3rd Qu.:1012.1
## Max. :1020.6
## NA's :2
sum(is.na(Weather_Rourkela))
## [1] 1293
## About 1293 entries are NA
## AQI stations dataset: stations.csv
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(AQ_stations)
## [1] 230 5
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_stations)
## Rows: 230
## Columns: 5
## $ StationId <chr> "AP001", "AP002", "AP003", "AP004", "AP005", "AS001", "BR0…
## $ StationName <chr> "Secretariat, Amaravati - APPCB", "Anand Kala Kshetram, Ra…
## $ City <chr> "Amaravati", "Rajamahendravaram", "Tirupati", "Vijayawada"…
## $ State <chr> "Andhra Pradesh", "Andhra Pradesh", "Andhra Pradesh", "And…
## $ Status <chr> "Active", "", "", "", "Active", "Active", "", "", "", "", …
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(AQ_stations)
## [1] "StationId" "StationName" "City" "State" "Status"
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_stations)
## 'data.frame': 230 obs. of 5 variables:
## $ StationId : chr "AP001" "AP002" "AP003" "AP004" ...
## $ StationName: chr "Secretariat, Amaravati - APPCB" "Anand Kala Kshetram, Rajamahendravaram - APPCB" "Tirumala, Tirupati - APPCB" "PWD Grounds, Vijayawada - APPCB" ...
## $ City : chr "Amaravati" "Rajamahendravaram" "Tirupati" "Vijayawada" ...
## $ State : chr "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" ...
## $ Status : chr "Active" "" "" "" ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(AQ_stations)
## StationId StationName City State
## Length:230 Length:230 Length:230 Length:230
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Status
## Length:230
## Class :character
## Mode :character
## Blank Status strings are really missing values: recode them to NA first,
## then list the affected stations. attach() is deliberately NOT used here:
## an attached data frame is a snapshot taken at attach() time, so after the
## recode below a bare is.na(Status) would still see the old "" values and
## always report zero rows.
AQ_stations[AQ_stations == ""] <- NA
AQ_stations[is.na(AQ_stations$Status), ]
## NOTE(review): the original transcript showed "<0 rows>" at this point --
## that was the stale attach()ed copy of Status, not evidence that no Status
## values are missing (glimpse above shows many "" entries).
## AQI station hour-wise dataset: station_hour.csv
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(AQ_station_hour)
## [1] 2589083 16
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_station_hour)
## Rows: 2,589,083
## Columns: 16
## $ StationId <chr> "AP001", "AP001", "AP001", "AP001", "AP001", "AP001", "AP00…
## $ Datetime <chr> "2017-11-24 17:00:00", "2017-11-24 18:00:00", "2017-11-24 1…
## $ PM2.5 <dbl> 60.50, 65.50, 80.00, 81.50, 75.25, 69.25, 67.50, 68.00, 73.…
## $ PM10 <dbl> 98.00, 111.25, 132.00, 133.25, 116.00, 108.25, 111.50, 111.…
## $ NO <dbl> 2.35, 2.70, 2.10, 1.95, 1.43, 0.70, 1.05, 1.25, 0.30, 0.80,…
## $ NO2 <dbl> 30.80, 24.20, 25.18, 16.25, 17.48, 18.47, 12.15, 14.12, 14.…
## $ NOx <dbl> 18.25, 15.07, 15.15, 10.23, 10.43, 10.38, 7.30, 8.50, 7.90,…
## $ NH3 <dbl> 8.50, 9.77, 12.02, 11.58, 12.03, 13.80, 17.65, 20.28, 11.50…
## $ CO <dbl> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.1, 0.1, 0.1,…
## $ SO2 <dbl> 11.85, 13.17, 12.08, 10.47, 9.12, 9.25, 9.40, 8.90, 11.80, …
## $ O3 <dbl> 126.40, 117.12, 98.98, 112.20, 106.35, 91.10, 112.70, 116.1…
## $ Benzene <dbl> 0.10, 0.10, 0.20, 0.20, 0.20, 0.20, 0.20, 0.20, 0.20, 0.23,…
## $ Toluene <dbl> 6.10, 6.25, 5.98, 6.72, 5.75, 5.02, 5.60, 5.55, 6.60, 6.77,…
## $ Xylene <dbl> 0.10, 0.15, 0.18, 0.10, 0.08, 0.00, 0.10, 0.05, 0.00, 0.10,…
## $ AQI <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ AQI_Bucket <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(AQ_station_hour)
## [1] "StationId" "Datetime" "PM2.5" "PM10" "NO"
## [6] "NO2" "NOx" "NH3" "CO" "SO2"
## [11] "O3" "Benzene" "Toluene" "Xylene" "AQI"
## [16] "AQI_Bucket"
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_station_hour)
## 'data.frame': 2589083 obs. of 16 variables:
## $ StationId : chr "AP001" "AP001" "AP001" "AP001" ...
## $ Datetime : chr "2017-11-24 17:00:00" "2017-11-24 18:00:00" "2017-11-24 19:00:00" "2017-11-24 20:00:00" ...
## $ PM2.5 : num 60.5 65.5 80 81.5 75.2 ...
## $ PM10 : num 98 111 132 133 116 ...
## $ NO : num 2.35 2.7 2.1 1.95 1.43 0.7 1.05 1.25 0.3 0.8 ...
## $ NO2 : num 30.8 24.2 25.2 16.2 17.5 ...
## $ NOx : num 18.2 15.1 15.2 10.2 10.4 ...
## $ NH3 : num 8.5 9.77 12.02 11.58 12.03 ...
## $ CO : num 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.3 0.1 ...
## $ SO2 : num 11.85 13.17 12.08 10.47 9.12 ...
## $ O3 : num 126 117 99 112 106 ...
## $ Benzene : num 0.1 0.1 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.23 ...
## $ Toluene : num 6.1 6.25 5.98 6.72 5.75 5.02 5.6 5.55 6.6 6.77 ...
## $ Xylene : num 0.1 0.15 0.18 0.1 0.08 0 0.1 0.05 0 0.1 ...
## $ AQI : num NA NA NA NA NA NA NA NA NA NA ...
## $ AQI_Bucket: chr "" "" "" "" ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(AQ_station_hour)
## StationId Datetime PM2.5 PM10
## Length:2589083 Length:2589083 Min. : 0.0 Min. : 0.0
## Class :character Class :character 1st Qu.: 28.2 1st Qu.: 64.0
## Mode :character Mode :character Median : 52.6 Median : 116.2
## Mean : 80.9 Mean : 158.5
## 3rd Qu.: 97.7 3rd Qu.: 204.0
## Max. :1000.0 Max. :1000.0
## NA's :647689 NA's :1119252
## NO NO2 NOx NH3
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 3.0 1st Qu.: 13.1 1st Qu.: 11.3 1st Qu.: 11.2
## Median : 7.2 Median : 24.8 Median : 22.9 Median : 22.4
## Mean : 22.8 Mean : 35.2 Mean : 40.6 Mean : 28.7
## 3rd Qu.: 18.6 3rd Qu.: 45.5 3rd Qu.: 45.7 3rd Qu.: 37.8
## Max. :500.0 Max. :500.0 Max. :500.0 Max. :500.0
## NA's :553711 NA's :528973 NA's :490808 NA's :1236618
## CO SO2 O3 Benzene
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.4 1st Qu.: 4.2 1st Qu.: 11.0 1st Qu.: 0.1
## Median : 0.8 Median : 8.2 Median : 24.8 Median : 1.0
## Mean : 1.5 Mean : 12.1 Mean : 38.1 Mean : 3.3
## 3rd Qu.: 1.4 3rd Qu.: 14.5 3rd Qu.: 49.5 3rd Qu.: 3.2
## Max. :498.6 Max. :200.0 Max. :997.0 Max. :498.1
## NA's :499302 NA's :742737 NA's :725973 NA's :861579
## Toluene Xylene AQI AQI_Bucket
## Min. : 0.0 Min. : 0.0 Min. : 5.0 Length:2589083
## 1st Qu.: 0.3 1st Qu.: 0.0 1st Qu.: 84.0 Class :character
## Median : 3.4 Median : 0.2 Median : 131.0 Mode :character
## Mean : 14.9 Mean : 2.4 Mean : 180.2
## 3rd Qu.: 15.1 3rd Qu.: 1.8 3rd Qu.: 259.0
## Max. :500.0 Max. :500.0 Max. :3133.0
## NA's :1042366 NA's :2075104 NA's :570190
## NOTE(review): attach() copies columns onto the search path (hence the
## masking message below); those copies do NOT reflect later edits such as
## the ""->NA recode a few lines down. Prefer AQ_station_hour$col access.
attach(AQ_station_hour)
## The following object is masked from AQ_stations:
##
## StationId
## Recode empty strings (the blank AQI_Bucket entries) as NA in all columns.
AQ_station_hour [AQ_station_hour == ""] <- NA
## So the air quality seems to be dependent on 12 parameters
## There are too many NAs/missing data amongst them (per the summary above):
## PM2.5:647689 PM10:1119252 NO:553711 NO2:528973 NOx:490808 NH3:1236618
## CO:499302 SO2:742737 O3:725973 Benzene:861579 Toluene:1042366 Xylene:2075104
AQ_station_hour %>% group_by(AQI_Bucket)%>%count()
## # A tibble: 7 × 2
## # Groups: AQI_Bucket [7]
## AQI_Bucket n
## <chr> <int>
## 1 Good 152113
## 2 Moderate 675008
## 3 Poor 239990
## 4 Satisfactory 530164
## 5 Severe 120468
## 6 Very Poor 301150
## 7 <NA> 570190
## Looks like Moderate entries are the highest ones but second highest is NA entries...
AQ_station_day: station_day.csv
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(AQ_station_day)
## [1] 108035 16
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_station_day)
## Rows: 108,035
## Columns: 16
## $ StationId <chr> "AP001", "AP001", "AP001", "AP001", "AP001", "AP001", "AP00…
## $ Date <chr> "2017-11-24", "2017-11-25", "2017-11-26", "2017-11-27", "20…
## $ PM2.5 <dbl> 71.36, 81.40, 78.32, 88.76, 64.18, 72.47, 69.80, 73.96, 89.…
## $ PM10 <dbl> 115.75, 124.50, 129.06, 135.32, 104.09, 114.84, 114.86, 113…
## $ NO <dbl> 1.75, 1.44, 1.26, 6.60, 2.56, 5.23, 4.69, 4.58, 7.71, 0.97,…
## $ NO2 <dbl> 20.65, 20.50, 26.00, 30.85, 28.07, 23.20, 20.17, 19.29, 26.…
## $ NOx <dbl> 12.40, 12.08, 14.85, 21.77, 17.01, 16.59, 14.54, 13.97, 19.…
## $ NH3 <dbl> 12.19, 10.72, 10.28, 12.91, 11.42, 12.25, 10.95, 10.95, 13.…
## $ CO <dbl> 0.10, 0.12, 0.14, 0.11, 0.09, 0.16, 0.12, 0.10, 0.10, 0.15,…
## $ SO2 <dbl> 10.76, 15.24, 26.96, 33.59, 19.00, 10.55, 14.07, 13.90, 19.…
## $ O3 <dbl> 109.26, 127.09, 117.44, 111.81, 138.18, 109.74, 118.09, 123…
## $ Benzene <dbl> 0.17, 0.20, 0.22, 0.29, 0.17, 0.21, 0.16, 0.17, 0.25, 0.23,…
## $ Toluene <dbl> 5.92, 6.50, 7.95, 7.63, 5.02, 4.71, 3.52, 2.85, 2.79, 3.82,…
## $ Xylene <dbl> 0.10, 0.06, 0.08, 0.12, 0.07, 0.08, 0.06, 0.04, 0.07, 0.04,…
## $ AQI <dbl> NA, 184, 197, 198, 188, 173, 165, 191, 191, 227, 168, 198, …
## $ AQI_Bucket <chr> "", "Moderate", "Moderate", "Moderate", "Moderate", "Modera…
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(AQ_station_day)
## [1] "StationId" "Date" "PM2.5" "PM10" "NO"
## [6] "NO2" "NOx" "NH3" "CO" "SO2"
## [11] "O3" "Benzene" "Toluene" "Xylene" "AQI"
## [16] "AQI_Bucket"
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_station_day)
## 'data.frame': 108035 obs. of 16 variables:
## $ StationId : chr "AP001" "AP001" "AP001" "AP001" ...
## $ Date : chr "2017-11-24" "2017-11-25" "2017-11-26" "2017-11-27" ...
## $ PM2.5 : num 71.4 81.4 78.3 88.8 64.2 ...
## $ PM10 : num 116 124 129 135 104 ...
## $ NO : num 1.75 1.44 1.26 6.6 2.56 5.23 4.69 4.58 7.71 0.97 ...
## $ NO2 : num 20.6 20.5 26 30.9 28.1 ...
## $ NOx : num 12.4 12.1 14.8 21.8 17 ...
## $ NH3 : num 12.2 10.7 10.3 12.9 11.4 ...
## $ CO : num 0.1 0.12 0.14 0.11 0.09 0.16 0.12 0.1 0.1 0.15 ...
## $ SO2 : num 10.8 15.2 27 33.6 19 ...
## $ O3 : num 109 127 117 112 138 ...
## $ Benzene : num 0.17 0.2 0.22 0.29 0.17 0.21 0.16 0.17 0.25 0.23 ...
## $ Toluene : num 5.92 6.5 7.95 7.63 5.02 4.71 3.52 2.85 2.79 3.82 ...
## $ Xylene : num 0.1 0.06 0.08 0.12 0.07 0.08 0.06 0.04 0.07 0.04 ...
## $ AQI : num NA 184 197 198 188 173 165 191 191 227 ...
## $ AQI_Bucket: chr "" "Moderate" "Moderate" "Moderate" ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(AQ_station_day)
## StationId Date PM2.5 PM10
## Length:108035 Length:108035 Min. : 0.02 Min. : 0.01
## Class :character Class :character 1st Qu.: 31.88 1st Qu.: 70.15
## Mode :character Mode :character Median : 55.95 Median : 122.09
## Mean : 80.27 Mean : 157.97
## 3rd Qu.: 99.92 3rd Qu.: 208.67
## Max. :1000.00 Max. :1000.00
## NA's :21625 NA's :42706
## NO NO2 NOx NH3
## Min. : 0.01 Min. : 0.01 Min. : 0.00 Min. : 0.01
## 1st Qu.: 4.84 1st Qu.: 15.09 1st Qu.: 13.97 1st Qu.: 11.90
## Median : 10.29 Median : 27.21 Median : 26.66 Median : 23.59
## Mean : 23.12 Mean : 35.24 Mean : 41.20 Mean : 28.73
## 3rd Qu.: 24.98 3rd Qu.: 46.93 3rd Qu.: 50.50 3rd Qu.: 38.14
## Max. :470.00 Max. :448.05 Max. :467.63 Max. :418.90
## NA's :17106 NA's :16547 NA's :15500 NA's :48105
## CO SO2 O3 Benzene
## Min. : 0.000 Min. : 0.01 Min. : 0.01 Min. : 0.000
## 1st Qu.: 0.530 1st Qu.: 5.04 1st Qu.: 18.89 1st Qu.: 0.160
## Median : 0.910 Median : 8.95 Median : 30.84 Median : 1.210
## Mean : 1.606 Mean : 12.26 Mean : 38.13 Mean : 3.358
## 3rd Qu.: 1.450 3rd Qu.: 14.92 3rd Qu.: 47.14 3rd Qu.: 3.610
## Max. :175.810 Max. :195.65 Max. :963.00 Max. :455.030
## NA's :12998 NA's :25204 NA's :25568 NA's :31455
## Toluene Xylene AQI AQI_Bucket
## Min. : 0.00 Min. : 0.00 Min. : 8.0 Length:108035
## 1st Qu.: 0.69 1st Qu.: 0.00 1st Qu.: 86.0 Class :character
## Median : 4.33 Median : 0.40 Median : 132.0 Mode :character
## Mean : 15.35 Mean : 2.42 Mean : 179.7
## 3rd Qu.: 17.51 3rd Qu.: 2.11 3rd Qu.: 254.0
## Max. :454.85 Max. :170.37 Max. :2049.0
## NA's :38702 NA's :85137 NA's :21010
## NOTE(review): attach() copies columns onto the search path (see masking
## messages below); those copies do NOT reflect later edits such as the
## ""->NA recode a few lines down. Prefer AQ_station_day$col access.
attach(AQ_station_day)
## The following objects are masked from AQ_station_hour:
##
## AQI, AQI_Bucket, Benzene, CO, NH3, NO, NO2, NOx, O3, PM10, PM2.5,
## SO2, StationId, Toluene, Xylene
## The following object is masked from AQ_stations:
##
## StationId
## Recode empty strings (the blank AQI_Bucket entries) as NA in all columns.
AQ_station_day [AQ_station_day == ""] <- NA
## So the air quality seems to be dependent on 12 parameters
## There are too many NAs/missing data amongst them:
## PM2.5: 21625 PM10: 42706 NO: 17106 NO2: 16547 NOx: 15500 NH3: 48105
## CO: 12998 SO2: 25204 O3: 25568 Benzene: 31455 Toluene: 38702 Xylene: 85137
AQ_station_day %>% group_by(AQI_Bucket)%>%count()
## # A tibble: 7 × 2
## # Groups: AQI_Bucket [7]
## AQI_Bucket n
## <chr> <int>
## 1 Good 5510
## 2 Moderate 29417
## 3 Poor 11493
## 4 Satisfactory 23636
## 5 Severe 5207
## 6 Very Poor 11762
## 7 <NA> 21010
## Looks like Moderate entries are the highest ones, followed by Satisfactory
## but third highest is NA entries...
## Load the per-city daily air-quality measurements.
## Fixed: the original used Unicode smart quotes (“ ”), which R cannot parse.
AQ_city_day <- read.csv("./datasets/city_day.csv")
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(AQ_city_day)
## [1] 29531 16
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_city_day)
## Rows: 29,531
## Columns: 16
## $ City <chr> "Ahmedabad", "Ahmedabad", "Ahmedabad", "Ahmedabad", "Ahmeda…
## $ Date <chr> "2015-01-01", "2015-01-02", "2015-01-03", "2015-01-04", "20…
## $ PM2.5 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ PM10 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ NO <dbl> 0.92, 0.97, 17.40, 1.70, 22.10, 45.41, 112.16, 80.87, 29.16…
## $ NO2 <dbl> 18.22, 15.69, 19.30, 18.48, 21.42, 38.48, 40.62, 36.74, 31.…
## $ NOx <dbl> 17.15, 16.46, 29.70, 17.97, 37.76, 81.50, 130.77, 96.75, 48…
## $ NH3 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ CO <dbl> 0.92, 0.97, 17.40, 1.70, 22.10, 45.41, 112.16, 80.87, 29.16…
## $ SO2 <dbl> 27.64, 24.55, 29.07, 18.59, 39.33, 45.76, 32.28, 38.54, 58.…
## $ O3 <dbl> 133.36, 34.06, 30.70, 36.08, 39.31, 46.51, 33.47, 31.89, 25…
## $ Benzene <dbl> 0.00, 3.68, 6.80, 4.43, 7.01, 5.42, 0.00, 0.00, 0.00, 0.00,…
## $ Toluene <dbl> 0.02, 5.50, 16.40, 10.14, 18.89, 10.83, 0.00, 0.00, 0.00, 0…
## $ Xylene <dbl> 0.00, 3.77, 2.25, 1.00, 2.78, 1.93, 0.00, 0.00, 0.00, 0.00,…
## $ AQI <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ AQI_Bucket <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(AQ_city_day)
## [1] "City" "Date" "PM2.5" "PM10" "NO"
## [6] "NO2" "NOx" "NH3" "CO" "SO2"
## [11] "O3" "Benzene" "Toluene" "Xylene" "AQI"
## [16] "AQI_Bucket"
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_city_day)
## 'data.frame': 29531 obs. of 16 variables:
## $ City : chr "Ahmedabad" "Ahmedabad" "Ahmedabad" "Ahmedabad" ...
## $ Date : chr "2015-01-01" "2015-01-02" "2015-01-03" "2015-01-04" ...
## $ PM2.5 : num NA NA NA NA NA NA NA NA NA NA ...
## $ PM10 : num NA NA NA NA NA NA NA NA NA NA ...
## $ NO : num 0.92 0.97 17.4 1.7 22.1 ...
## $ NO2 : num 18.2 15.7 19.3 18.5 21.4 ...
## $ NOx : num 17.1 16.5 29.7 18 37.8 ...
## $ NH3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ CO : num 0.92 0.97 17.4 1.7 22.1 ...
## $ SO2 : num 27.6 24.6 29.1 18.6 39.3 ...
## $ O3 : num 133.4 34.1 30.7 36.1 39.3 ...
## $ Benzene : num 0 3.68 6.8 4.43 7.01 5.42 0 0 0 0 ...
## $ Toluene : num 0.02 5.5 16.4 10.14 18.89 ...
## $ Xylene : num 0 3.77 2.25 1 2.78 1.93 0 0 0 0 ...
## $ AQI : num NA NA NA NA NA NA NA NA NA NA ...
## $ AQI_Bucket: chr "" "" "" "" ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(AQ_city_day)
## City Date PM2.5 PM10
## Length:29531 Length:29531 Min. : 0.04 Min. : 0.01
## Class :character Class :character 1st Qu.: 28.82 1st Qu.: 56.26
## Mode :character Mode :character Median : 48.57 Median : 95.68
## Mean : 67.45 Mean : 118.13
## 3rd Qu.: 80.59 3rd Qu.: 149.75
## Max. :949.99 Max. :1000.00
## NA's :4598 NA's :11140
## NO NO2 NOx NH3
## Min. : 0.02 Min. : 0.01 Min. : 0.00 Min. : 0.01
## 1st Qu.: 5.63 1st Qu.: 11.75 1st Qu.: 12.82 1st Qu.: 8.58
## Median : 9.89 Median : 21.69 Median : 23.52 Median : 15.85
## Mean : 17.57 Mean : 28.56 Mean : 32.31 Mean : 23.48
## 3rd Qu.: 19.95 3rd Qu.: 37.62 3rd Qu.: 40.13 3rd Qu.: 30.02
## Max. :390.68 Max. :362.21 Max. :467.63 Max. :352.89
## NA's :3582 NA's :3585 NA's :4185 NA's :10328
## CO SO2 O3 Benzene
## Min. : 0.000 Min. : 0.01 Min. : 0.01 Min. : 0.000
## 1st Qu.: 0.510 1st Qu.: 5.67 1st Qu.: 18.86 1st Qu.: 0.120
## Median : 0.890 Median : 9.16 Median : 30.84 Median : 1.070
## Mean : 2.249 Mean : 14.53 Mean : 34.49 Mean : 3.281
## 3rd Qu.: 1.450 3rd Qu.: 15.22 3rd Qu.: 45.57 3rd Qu.: 3.080
## Max. :175.810 Max. :193.86 Max. :257.73 Max. :455.030
## NA's :2059 NA's :3854 NA's :4022 NA's :5623
## Toluene Xylene AQI AQI_Bucket
## Min. : 0.000 Min. : 0.00 Min. : 13.0 Length:29531
## 1st Qu.: 0.600 1st Qu.: 0.14 1st Qu.: 81.0 Class :character
## Median : 2.970 Median : 0.98 Median : 118.0 Mode :character
## Mean : 8.701 Mean : 3.07 Mean : 166.5
## 3rd Qu.: 9.150 3rd Qu.: 3.35 3rd Qu.: 208.0
## Max. :454.850 Max. :170.37 Max. :2049.0
## NA's :8041 NA's :18109 NA's :4681
## NOTE(review): attach() copies columns onto the search path (see masking
## messages below); those copies do NOT reflect later edits such as the
## ""->NA recode a few lines down. Prefer AQ_city_day$col access.
attach(AQ_city_day)
## The following objects are masked from AQ_station_day:
##
## AQI, AQI_Bucket, Benzene, CO, Date, NH3, NO, NO2, NOx, O3, PM10,
## PM2.5, SO2, Toluene, Xylene
## The following objects are masked from AQ_station_hour:
##
## AQI, AQI_Bucket, Benzene, CO, NH3, NO, NO2, NOx, O3, PM10, PM2.5,
## SO2, Toluene, Xylene
## The following object is masked from AQ_stations:
##
## City
## Recode empty strings (the blank AQI_Bucket entries) as NA in all columns.
AQ_city_day [AQ_city_day == ""] <- NA
## So the air quality seems to be dependent on 12 parameters
## There are too many NAs/missing data amongst them:
## PM2.5: 4598 PM10: 11140 NO: 3582 NO2: 3585 NOx: 4185 NH3: 10328
## CO: 2059 SO2: 3854 O3: 4022 Benzene: 5623 Toluene: 8041 Xylene: 18109
AQ_city_day %>% group_by(AQI_Bucket)%>%count()
## # A tibble: 7 × 2
## # Groups: AQI_Bucket [7]
## AQI_Bucket n
## <chr> <int>
## 1 Good 1341
## 2 Moderate 8829
## 3 Poor 2781
## 4 Satisfactory 8224
## 5 Severe 1338
## 6 Very Poor 2337
## 7 <NA> 4681
## Looks like Moderate entries are the highest ones, followed by Satisfactory
## but third highest is NA entries...
AQ_city_hour: city_hour.csv
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(AQ_city_hour)
## [1] 707875 16
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(AQ_city_hour)
## Rows: 707,875
## Columns: 16
## $ City <chr> "Ahmedabad", "Ahmedabad", "Ahmedabad", "Ahmedabad", "Ahmeda…
## $ Datetime <chr> "2015-01-01 01:00:00", "2015-01-01 02:00:00", "2015-01-01 0…
## $ PM2.5 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ PM10 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ NO <dbl> 1.00, 0.02, 0.08, 0.30, 0.12, 0.33, 0.45, 1.03, 1.47, 2.05,…
## $ NO2 <dbl> 40.01, 27.75, 19.32, 16.45, 14.90, 15.95, 15.94, 16.66, 16.…
## $ NOx <dbl> 36.37, 19.73, 11.08, 9.20, 7.85, 10.82, 12.47, 16.48, 18.02…
## $ NH3 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ CO <dbl> 1.00, 0.02, 0.08, 0.30, 0.12, 0.33, 0.45, 1.03, 1.47, 2.05,…
## $ SO2 <dbl> 122.07, 85.90, 52.83, 39.53, 32.63, 29.87, 27.41, 20.92, 16…
## $ O3 <dbl> NA, NA, NA, 153.58, NA, 64.25, 191.96, 177.21, 122.08, NA, …
## $ Benzene <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Toluene <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,…
## $ Xylene <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ AQI <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ AQI_Bucket <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(AQ_city_hour)
## [1] "City" "Datetime" "PM2.5" "PM10" "NO"
## [6] "NO2" "NOx" "NH3" "CO" "SO2"
## [11] "O3" "Benzene" "Toluene" "Xylene" "AQI"
## [16] "AQI_Bucket"
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(AQ_city_hour)
## 'data.frame': 707875 obs. of 16 variables:
## $ City : chr "Ahmedabad" "Ahmedabad" "Ahmedabad" "Ahmedabad" ...
## $ Datetime : chr "2015-01-01 01:00:00" "2015-01-01 02:00:00" "2015-01-01 03:00:00" "2015-01-01 04:00:00" ...
## $ PM2.5 : num NA NA NA NA NA NA NA NA NA NA ...
## $ PM10 : num NA NA NA NA NA NA NA NA NA NA ...
## $ NO : num 1 0.02 0.08 0.3 0.12 0.33 0.45 1.03 1.47 2.05 ...
## $ NO2 : num 40 27.8 19.3 16.4 14.9 ...
## $ NOx : num 36.37 19.73 11.08 9.2 7.85 ...
## $ NH3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ CO : num 1 0.02 0.08 0.3 0.12 0.33 0.45 1.03 1.47 2.05 ...
## $ SO2 : num 122.1 85.9 52.8 39.5 32.6 ...
## $ O3 : num NA NA NA 154 NA ...
## $ Benzene : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Toluene : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Xylene : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AQI : num NA NA NA NA NA NA NA NA NA NA ...
## $ AQI_Bucket: chr "" "" "" "" ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(AQ_city_hour)
## City Datetime PM2.5 PM10
## Length:707875 Length:707875 Min. : 0.01 Min. : 0.01
## Class :character Class :character 1st Qu.: 26.20 1st Qu.: 52.38
## Mode :character Mode :character Median : 46.42 Median : 91.50
## Mean : 67.62 Mean : 119.08
## 3rd Qu.: 79.49 3rd Qu.: 147.52
## Max. : 999.99 Max. :1000.00
## NA's :145088 NA's :296737
## NO NO2 NOx NH3
## Min. : 0.01 Min. : 0.01 Min. : 0.00 Min. : 0.01
## 1st Qu.: 3.84 1st Qu.: 10.81 1st Qu.: 10.66 1st Qu.: 8.12
## Median : 7.96 Median : 20.32 Median : 20.79 Median : 15.38
## Mean : 17.42 Mean : 28.89 Mean : 32.29 Mean : 23.61
## 3rd Qu.: 16.15 3rd Qu.: 36.35 3rd Qu.: 37.15 3rd Qu.: 29.23
## Max. :499.99 Max. :499.51 Max. :498.61 Max. :499.97
## NA's :116632 NA's :117122 NA's :123224 NA's :272542
## CO SO2 O3 Benzene
## Min. : 0.00 Min. : 0.01 Min. : 0.01 Min. : 0.00
## 1st Qu.: 0.42 1st Qu.: 4.88 1st Qu.: 13.42 1st Qu.: 0.05
## Median : 0.80 Median : 8.37 Median : 26.24 Median : 0.86
## Mean : 2.18 Mean : 14.04 Mean : 34.80 Mean : 3.09
## 3rd Qu.: 1.37 3rd Qu.: 14.78 3rd Qu.: 47.62 3rd Qu.: 2.75
## Max. :498.57 Max. :199.96 Max. :497.62 Max. :498.07
## NA's :86517 NA's :130373 NA's :129208 NA's :163646
## Toluene Xylene AQI AQI_Bucket
## Min. : 0.00 Min. : 0.0 Min. : 8.0 Length:707875
## 1st Qu.: 0.37 1st Qu.: 0.1 1st Qu.: 79.0 Class :character
## Median : 2.59 Median : 0.8 Median : 116.0 Mode :character
## Mean : 8.66 Mean : 3.1 Mean : 166.4
## 3rd Qu.: 8.41 3rd Qu.: 3.1 3rd Qu.: 208.0
## Max. :499.40 Max. :500.0 Max. :3133.0
## NA's :220607 NA's :455829 NA's :129080
## NOTE(review): attach() copies columns onto the search path (see masking
## messages below); those copies do NOT reflect later edits such as the
## ""->NA recode a few lines down. Prefer AQ_city_hour$col access.
attach(AQ_city_hour)
## The following objects are masked from AQ_city_day:
##
## AQI, AQI_Bucket, Benzene, CO, City, NH3, NO, NO2, NOx, O3, PM10,
## PM2.5, SO2, Toluene, Xylene
## The following objects are masked from AQ_station_day:
##
## AQI, AQI_Bucket, Benzene, CO, NH3, NO, NO2, NOx, O3, PM10, PM2.5,
## SO2, Toluene, Xylene
## The following objects are masked from AQ_station_hour:
##
## AQI, AQI_Bucket, Benzene, CO, Datetime, NH3, NO, NO2, NOx, O3,
## PM10, PM2.5, SO2, Toluene, Xylene
## The following object is masked from AQ_stations:
##
## City
## Recode empty strings (the blank AQI_Bucket entries) as NA in all columns.
AQ_city_hour [AQ_city_hour == ""] <- NA
## So the air quality seems to be dependent on 12 parameters
## There are too many NAs/missing data amongst them:
## PM2.5: 145088 PM10: 296737 NO: 116632 NO2: 117122 NOx: 123224 NH3: 272542
## CO: 86517 SO2: 130373 O3: 129208 Benzene: 163646 Toluene: 220607 Xylene: 455829
AQ_city_hour %>% group_by(AQI_Bucket)%>%count()
## # A tibble: 7 × 2
## # Groups: AQI_Bucket [7]
## AQI_Bucket n
## <chr> <int>
## 1 Good 38611
## 2 Moderate 198991
## 3 Poor 66654
## 4 Satisfactory 189434
## 5 Severe 27650
## 6 Very Poor 57455
## 7 <NA> 129080
## Looks like Moderate entries are the highest ones, followed by Satisfactory
## but third highest is NA entries...
Airport_delay: Aiport_Delay.csv
## Have a look at the data
print ("The dimensions of the dataset")
## [1] "The dimensions of the dataset"
dim(Airport_delay)
## [1] 14952 22
print("Lets have a glimpse of the dataset")
## [1] "Lets have a glimpse of the dataset"
glimpse(Airport_delay)
## Rows: 14,952
## Columns: 22
## $ Date <chr> "28-1-18", "28-1-18", …
## $ Departure.Airport <chr> "BLR", "CCU", "DEL", "…
## $ Departure.Airport.Rating..out.of.10. <dbl> NA, NA, 7.99, 7.29, NA…
## $ Departure.Airport.On.Time.Rating..out.of.10. <dbl> NA, NA, 7.3, 6.2, NA, …
## $ Departure.Airport.Service.Rating..out.of.10. <dbl> NA, NA, 9.1, 9.0, NA, …
## $ Arrival.Airport <chr> "DEL", "DEL", "HYD", "…
## $ Arrival.Airport.Rating..out.of.10. <dbl> 7.99, 7.99, 8.27, 7.99…
## $ Arrival.Airport.On.Time.Rating..out.of.10. <dbl> 7.3, 7.3, 7.8, 7.3, 6.…
## $ Arrival.Airport.Service.Rating..out.of.10. <dbl> 9.1, 9.1, 9.0, 9.1, 9.…
## $ Airplane.Type <chr> "", "", "", "", "", "A…
## $ Expected.Departure.Time <chr> "6:10", "7:00", "7:05"…
## $ Departure.Time <chr> "6:10", "7:01", "7:33"…
## $ Departure.Delay <chr> "0:00:00", "0:01:00", …
## $ Duration <chr> "2:20", "2:09", "1:46"…
## $ Expected.Arrival.Time <chr> "8:55", "9:10", "9:10"…
## $ Arrival.Time <chr> "8:30", "9:10", "9:19"…
## $ Arrival.Time.Delay <chr> "-0:25:00", "0:00:00",…
## $ Carrier <chr> "Air India", "Air Indi…
## $ Carrier.Rating..out.of.10. <dbl> 6.6, 6.6, 6.6, 6.6, 6.…
## $ Carrier.Market.Share..out.of.100. <dbl> 12.0, 12.0, 12.0, 12.0…
## $ Carrier.Load.Factor..out.of.100. <dbl> 80.75, 80.75, 80.75, 8…
## $ Carrier.On.Time.Performance.Rating..out.of.100. <dbl> 70.3, 70.3, 70.3, 70.3…
print("Lets find the column names of the dataset")
## [1] "Lets find the column names of the dataset"
names(Airport_delay)
## [1] "Date"
## [2] "Departure.Airport"
## [3] "Departure.Airport.Rating..out.of.10."
## [4] "Departure.Airport.On.Time.Rating..out.of.10."
## [5] "Departure.Airport.Service.Rating..out.of.10."
## [6] "Arrival.Airport"
## [7] "Arrival.Airport.Rating..out.of.10."
## [8] "Arrival.Airport.On.Time.Rating..out.of.10."
## [9] "Arrival.Airport.Service.Rating..out.of.10."
## [10] "Airplane.Type"
## [11] "Expected.Departure.Time"
## [12] "Departure.Time"
## [13] "Departure.Delay"
## [14] "Duration"
## [15] "Expected.Arrival.Time"
## [16] "Arrival.Time"
## [17] "Arrival.Time.Delay"
## [18] "Carrier"
## [19] "Carrier.Rating..out.of.10."
## [20] "Carrier.Market.Share..out.of.100."
## [21] "Carrier.Load.Factor..out.of.100."
## [22] "Carrier.On.Time.Performance.Rating..out.of.100."
print("Lets find the structure of the dataset")
## [1] "Lets find the structure of the dataset"
str(Airport_delay)
## 'data.frame': 14952 obs. of 22 variables:
## $ Date : chr "28-1-18" "28-1-18" "28-1-18" "28-1-18" ...
## $ Departure.Airport : chr "BLR" "CCU" "DEL" "BOM" ...
## $ Departure.Airport.Rating..out.of.10. : num NA NA 7.99 7.29 NA 7.99 NA NA 7.99 NA ...
## $ Departure.Airport.On.Time.Rating..out.of.10. : num NA NA 7.3 6.2 NA 7.3 NA NA 7.3 NA ...
## $ Departure.Airport.Service.Rating..out.of.10. : num NA NA 9.1 9 NA 9.1 NA NA 9.1 NA ...
## $ Arrival.Airport : chr "DEL" "DEL" "HYD" "DEL" ...
## $ Arrival.Airport.Rating..out.of.10. : num 7.99 7.99 8.27 7.99 7.29 8.27 7.29 7.99 8.27 7.29 ...
## $ Arrival.Airport.On.Time.Rating..out.of.10. : num 7.3 7.3 7.8 7.3 6.2 7.8 6.2 7.3 7.8 6.2 ...
## $ Arrival.Airport.Service.Rating..out.of.10. : num 9.1 9.1 9 9.1 9 9 9 9.1 9 9 ...
## $ Airplane.Type : chr "" "" "" "" ...
## $ Expected.Departure.Time : chr "6:10" "7:00" "7:05" "7:00" ...
## $ Departure.Time : chr "6:10" "7:01" "7:33" "7:07" ...
## $ Departure.Delay : chr "0:00:00" "0:01:00" "0:28:00" "0:07:00" ...
## $ Duration : chr "2:20" "2:09" "1:46" "1:40" ...
## $ Expected.Arrival.Time : chr "8:55" "9:10" "9:10" "9:05" ...
## $ Arrival.Time : chr "8:30" "9:10" "9:19" "8:47" ...
## $ Arrival.Time.Delay : chr "-0:25:00" "0:00:00" "0:09:00" "-0:18:00" ...
## $ Carrier : chr "Air India" "Air India" "Air India" "Air India" ...
## $ Carrier.Rating..out.of.10. : num 6.6 6.6 6.6 6.6 6.6 7.2 7.2 7.9 7.9 7.9 ...
## $ Carrier.Market.Share..out.of.100. : num 12 12 12 12 12 8.8 8.8 39.7 39.7 39.7 ...
## $ Carrier.Load.Factor..out.of.100. : num 80.8 80.8 80.8 80.8 80.8 ...
## $ Carrier.On.Time.Performance.Rating..out.of.100.: num 70.3 70.3 70.3 70.3 70.3 91.8 91.8 87.4 87.4 87.4 ...
print("Lets find the summary of the dataset")
## [1] "Lets find the summary of the dataset"
summary(Airport_delay)
## Date Departure.Airport Departure.Airport.Rating..out.of.10.
## Length:14952 Length:14952 Min. :7.290
## Class :character Class :character 1st Qu.:7.290
## Mode :character Mode :character Median :7.990
## Mean :7.741
## 3rd Qu.:7.990
## Max. :8.270
## NA's :10043
## Departure.Airport.On.Time.Rating..out.of.10.
## Min. :6.200
## 1st Qu.:6.200
## Median :7.300
## Mean :6.908
## 3rd Qu.:7.300
## Max. :7.800
## NA's :10043
## Departure.Airport.Service.Rating..out.of.10. Arrival.Airport
## Min. :9.000 Length:14952
## 1st Qu.:9.000 Class :character
## Median :9.100 Mode :character
## Mean :9.064
## 3rd Qu.:9.100
## Max. :9.100
## NA's :10043
## Arrival.Airport.Rating..out.of.10. Arrival.Airport.On.Time.Rating..out.of.10.
## Min. :7.29 Min. :6.200
## 1st Qu.:7.99 1st Qu.:7.300
## Median :7.99 Median :7.300
## Mean :7.91 Mean :7.187
## 3rd Qu.:7.99 3rd Qu.:7.300
## Max. :8.27 Max. :7.800
##
## Arrival.Airport.Service.Rating..out.of.10. Airplane.Type
## Min. :9.000 Length:14952
## 1st Qu.:9.000 Class :character
## Median :9.100 Mode :character
## Mean :9.059
## 3rd Qu.:9.100
## Max. :9.100
##
## Expected.Departure.Time Departure.Time Departure.Delay
## Length:14952 Length:14952 Length:14952
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Duration Expected.Arrival.Time Arrival.Time Arrival.Time.Delay
## Length:14952 Length:14952 Length:14952 Length:14952
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Carrier Carrier.Rating..out.of.10.
## Length:14952 Min. :6.600
## Class :character 1st Qu.:6.800
## Mode :character Median :7.200
## Mean :7.531
## 3rd Qu.:7.900
## Max. :9.200
##
## Carrier.Market.Share..out.of.100. Carrier.Load.Factor..out.of.100.
## Min. : 3.6 Min. :80.75
## 1st Qu.: 4.0 1st Qu.:81.80
## Median :12.0 Median :86.00
## Mean :13.2 Mean :86.88
## 3rd Qu.:13.1 3rd Qu.:93.30
## Max. :39.7 Max. :93.90
##
## Carrier.On.Time.Performance.Rating..out.of.100.
## Min. :70.30
## 1st Qu.:74.70
## Median :87.40
## Mean :83.14
## 3rd Qu.:89.10
## Max. :91.80
##
## NOTE(review): attach() copies columns onto the search path (see masking
## messages below); those copies do NOT reflect later edits such as the
## ""->NA recode a few lines down. Prefer Airport_delay$col access.
attach(Airport_delay)
## The following object is masked from AQ_city_day:
##
## Date
## The following object is masked from AQ_station_day:
##
## Date
## Recode empty strings (e.g. the blank Airplane.Type entries) as NA.
Airport_delay [Airport_delay == ""] <- NA
## List the distinct (departure airport, on-time rating) pairs:
## summarize() with no arguments keeps one row per group combination.
Airport_delay %>% group_by(Departure.Airport, Departure.Airport.On.Time.Rating..out.of.10.)%>%summarize()
## `summarise()` has grouped output by 'Departure.Airport'. You can override using
## the `.groups` argument.
## # A tibble: 5 × 2
## # Groups: Departure.Airport [5]
## Departure.Airport Departure.Airport.On.Time.Rating..out.of.10.
## <chr> <dbl>
## 1 BLR NA
## 2 BOM 6.2
## 3 CCU NA
## 4 DEL 7.3
## 5 HYD 7.8
## Mumbai seems to have the worst rating for departure on time performance
## Same idea for the arrival side: one row per distinct pair.
Airport_delay %>% group_by(Arrival.Airport, Arrival.Airport.On.Time.Rating..out.of.10.)%>%summarize()
## `summarise()` has grouped output by 'Arrival.Airport'. You can override using
## the `.groups` argument.
## # A tibble: 3 × 2
## # Groups: Arrival.Airport [3]
## Arrival.Airport Arrival.Airport.On.Time.Rating..out.of.10.
## <chr> <dbl>
## 1 BOM 6.2
## 2 DEL 7.3
## 3 HYD 7.8
## Mumbai seems to have the worst rating for Arrival on time performance as well
#################### Cleaning Datasets ######################
## Keep only fully observed rows: complete.cases() drops a row when ANY
## column is NA (not just tavg).
drop_incomplete <- function(df) df[complete.cases(df), ]
New_Weather_Bangalore <- drop_incomplete(Weather_Bangalore)
New_Weather_Delhi <- drop_incomplete(Weather_Delhi)
New_Weather_Lucknow <- drop_incomplete(Weather_Lucknow)
New_Weather_Mumbai <- drop_incomplete(Weather_Mumbai)
New_Weather_Jodhpur <- drop_incomplete(Weather_Jodhpur)
## Bhubhneshwar and Rourkela carry columns the other stations lack: snow and
## tsun have no valid entries, and wdir/wspd/pres/wpgt add no value for the
## scope of this analysis. Standardise the schema by dropping those columns
## first, then remove the incomplete rows.
drop_station_extras <- function(df) subset(df, select = -c(snow, wdir, wspd, pres, tsun, wpgt))
Standard_Weather_Bhubhneshwar <- drop_station_extras(Weather_Bhubhneshwar)
New_Weather_Bhubhneshwar <- drop_incomplete(Standard_Weather_Bhubhneshwar)
Standard_Weather_Rourkela <- drop_station_extras(Weather_Rourkela)
New_Weather_Rourkela <- drop_incomplete(Standard_Weather_Rourkela)
## For the AQI monitoring stations we only need the active ones.
New_AQ_stations <- filter(AQ_stations, Status == "Active")
## Air-quality measurement tables: keep fully observed rows only.
New_AQ_station_hour <- drop_incomplete(AQ_station_hour)
New_AQ_station_day <- drop_incomplete(AQ_station_day)
New_AQ_city_hour <- drop_incomplete(AQ_city_hour)
New_AQ_city_day <- drop_incomplete(AQ_city_day)
## Clean the Airport Delay data the same way.
New_Airport_delay <- drop_incomplete(Airport_delay)
Detect Outliers in the Cleaned Datasets
Since our analysis aims to find whether extreme weather conditions affect flight
traffic, we are deliberately looking for outliers — unlike typical analyses,
where outliers are removed.
Exploratory Analysis of Bangalore Weather Dataset
hist(x=New_Weather_Bangalore$tavg, main = "Bangalore Average Temperature")

## Values below 20 or above 30 are outliers for Bangalore average temperature
hist(x=New_Weather_Bangalore$tmin, main = "Bangalore Min Temperature")

## Values below 16 are outliers for Bangalore min temperature
hist(x=New_Weather_Bangalore$tmax, main = "Bangalore Max Temperature")

## Values above 35 are outliers for Bangalore MAX temperature
## (fixed: comment previously said "min")
hist(x=New_Weather_Bangalore$prcp, main = "Bangalore Precipitation", breaks = 5)

## Extreme precipitation cases are above 50
## Collect the extreme-weather days into a special dataset
Special_Weather_Bangalore <- New_Weather_Bangalore %>% filter((tavg < 20) | (tavg>30) | (tmin < 16) | (tmax > 35) | (prcp > 50))
## Since precipitation impacts flights the most, look at how precipitation
## relates to temperature.
## NOTE(review): size is passed as a data vector, not via aes(), so no size
## legend is drawn — confirm that is intended.
ggplot(Special_Weather_Bangalore, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Special_Weather_Bangalore$prcp/75) +
labs(title = "Impact of temperature on precipitation")

## The extreme precipitation appears to happen when tmin is between 16 and 22,
## so apply that filter and redraw.
Ext_Special_Weather_Bangalore <- Special_Weather_Bangalore %>% filter((tmin > 16) & (tmin < 22))
ggplot(Ext_Special_Weather_Bangalore, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Ext_Special_Weather_Bangalore$prcp/75) +
labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of Chennai Weather Dataset
Exploratory Analysis of Delhi Weather Dataset
hist(x=New_Weather_Delhi$tavg, main = "Delhi Average Temperature")

## Values below 15 or above 35 are outliers for Delhi average temperature
hist(x=New_Weather_Delhi$tmin, main = "Delhi Min Temperature")

## Values below 10 are outliers for Delhi min temperature
## (fixed: comment said "<16" but the filter below uses tmin < 10)
hist(x=New_Weather_Delhi$tmax, main = "Delhi Max Temperature")

## Values above 30 are outliers for Delhi MAX temperature
## (fixed: comment said ">35 ... min" but the filter below uses tmax > 30)
hist(x=New_Weather_Delhi$prcp, main = "Delhi Precipitation", breaks = 5)

## Extreme precipitation cases are above 50
## Collect the extreme-weather days into a special dataset
Special_Weather_Delhi <- New_Weather_Delhi %>% filter((tavg < 15) | (tavg>35) | (tmin < 10) | (tmax > 30) | (prcp > 50))
## Since precipitation impacts flights the most, look at how precipitation
## relates to temperature.
## NOTE(review): the point-size divisor is 50 here but 75 in the redraw and in
## the Bangalore section — confirm which scaling is intended.
ggplot(Special_Weather_Delhi, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Special_Weather_Delhi$prcp/50) +
labs(title = "Impact of temperature on precipitation")

## The extreme precipitation appears to happen when tmin is between 20 and 30,
## so apply that filter and redraw.
Ext_Special_Weather_Delhi <- Special_Weather_Delhi %>% filter((tmin > 20) & (tmin < 30))
ggplot(Ext_Special_Weather_Delhi, aes(x = tmin,
y = tmax,
color = prcp)) +
geom_point(size = Ext_Special_Weather_Delhi$prcp/75) +
labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of Lucknow Weather Dataset
## Distribution checks for Lucknow daily weather.
## (Plot-title typo "Temparature" corrected.)
hist(x = New_Weather_Lucknow$tavg, main = "Lucknow Average Temperature")

## Data outside <16 and >33 are outliers for Lucknow average
hist(x = New_Weather_Lucknow$tmin, main = "Lucknow Min Temperature")

## Data outside <15 are outliers for Lucknow min
hist(x = New_Weather_Lucknow$tmax, main = "Lucknow Max Temperature")

## Data outside >35 are outliers for Lucknow max (comment fixed: was "min")
hist(x = New_Weather_Lucknow$prcp, main = "Lucknow Precipitation", breaks = 5)

## Extreme precipitation cases are above 50
## So lets make the special (outlier) dataset.
## NOTE(review): tmax > 30 below disagrees with the >35 bound stated above --
## confirm which threshold is intended.
Special_Weather_Lucknow <- New_Weather_Lucknow %>%
  filter((tavg < 16) | (tavg > 33) | (tmin < 15) | (tmax > 30) | (prcp > 50))
## Precipitation makes the most impact on flights, so look at how precipitation
## relates to temperature.
## FIX: map prcp to point size inside aes() instead of passing the column via
## `$` outside the ggplot data pipeline.
ggplot(Special_Weather_Lucknow, aes(x = tmin, y = tmax, color = prcp)) +
  geom_point(aes(size = prcp / 50)) +
  labs(title = "Impact of temperature on precipitation")

## Extreme precipitation happens when tmin is between 20 and 30,
## so zoom into that band and redraw.
Ext_Special_Weather_Lucknow <- Special_Weather_Lucknow %>%
  filter((tmin > 20) & (tmin < 30))
ggplot(Ext_Special_Weather_Lucknow, aes(x = tmin, y = tmax, color = prcp)) +
  geom_point(aes(size = prcp / 75)) +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of Mumbai Weather Dataset
## Distribution checks for Mumbai daily weather.
## (Plot-title typo "Temparature" corrected.)
hist(x = New_Weather_Mumbai$tavg, main = "Mumbai Average Temperature")

## Data outside <25 and >30 are outliers for Mumbai average
hist(x = New_Weather_Mumbai$tmin, main = "Mumbai Min Temperature")

## Data outside <17 are outliers for Mumbai min
hist(x = New_Weather_Mumbai$tmax, main = "Mumbai Max Temperature")

## Data outside >35 are outliers for Mumbai max (comment fixed: was "min")
hist(x = New_Weather_Mumbai$prcp, main = "Mumbai Precipitation", breaks = 5)

## Extreme precipitation cases are above 50
## So lets make the special (outlier) dataset.
Special_Weather_Mumbai <- New_Weather_Mumbai %>%
  filter((tavg < 25) | (tavg > 30) | (tmin < 17) | (tmax > 35) | (prcp > 50))
## Precipitation makes the most impact on flights, so look at how precipitation
## relates to temperature.
## FIX: map prcp to point size inside aes() instead of passing the column via
## `$` outside the ggplot data pipeline.
ggplot(Special_Weather_Mumbai, aes(x = tmin, y = tmax, color = prcp)) +
  geom_point(aes(size = prcp / 50)) +
  labs(title = "Impact of temperature on precipitation")

## Extreme precipitation happens when tmin is between 22 and 27,
## so zoom into that band and redraw.
Ext_Special_Weather_Mumbai <- Special_Weather_Mumbai %>%
  filter((tmin > 22) & (tmin < 27))
ggplot(Ext_Special_Weather_Mumbai, aes(x = tmin, y = tmax, color = prcp)) +
  geom_point(aes(size = prcp / 75)) +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of Jodhpur Weather Dataset
## Distribution checks for Jodhpur daily weather.
## (Plot-title typo "Temparature" corrected.)
hist(x = New_Weather_Jodhpur$tavg, main = "Jodhpur Average Temperature")

## Data outside <22 and >28 are outliers for Jodhpur average
hist(x = New_Weather_Jodhpur$tmin, main = "Jodhpur Min Temperature")

## Data outside <16 are outliers for Jodhpur min
hist(x = New_Weather_Jodhpur$tmax, main = "Jodhpur Max Temperature")

## Data outside >33 are outliers for Jodhpur max (comment fixed: was "min")
hist(x = New_Weather_Jodhpur$prcp, main = "Jodhpur Precipitation", breaks = 5)

## Extreme precipitation cases are above 50
## So lets make the special (outlier) dataset.
Special_Weather_Jodhpur <- New_Weather_Jodhpur %>%
  filter((tavg < 22) | (tavg > 28) | (tmin < 16) | (tmax > 33) | (prcp > 50))
## Precipitation makes the most impact on flights, so look at how precipitation
## relates to temperature.
## FIX: map prcp to point size inside aes() instead of passing the column via
## `$` outside the ggplot data pipeline.
ggplot(Special_Weather_Jodhpur, aes(x = tmin, y = tmax, color = prcp)) +
  geom_point(aes(size = prcp / 50)) +
  labs(title = "Impact of temperature on precipitation")

## Extreme precipitation happens when tmin is between 17 and 23,
## so zoom into that band and redraw.
Ext_Special_Weather_Jodhpur <- Special_Weather_Jodhpur %>%
  filter((tmin > 17) & (tmin < 23))
ggplot(Ext_Special_Weather_Jodhpur, aes(x = tmin, y = tmax, color = prcp)) +
  geom_point(aes(size = prcp / 75)) +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of Bhubhenshwar Weather Dataset
## Distribution checks for Bhubaneswar daily weather.
## NOTE(review): the source dataset is spelled "Bhubhneshwar" while the derived
## datasets are spelled "Bhubhenshwar"; both spellings are kept here because
## other parts of the file reference them as-is.
## (Plot-title typo "Temparature" corrected.)
hist(x = New_Weather_Bhubhneshwar$tavg, main = "Bhubhenshwar Average Temperature")

## Data outside <24 and >32 are outliers for Bhubhenshwar average
hist(x = New_Weather_Bhubhneshwar$tmin, main = "Bhubhenshwar Min Temperature")

## Data outside <15 are outliers for Bhubhenshwar min
hist(x = New_Weather_Bhubhneshwar$tmax, main = "Bhubhenshwar Max Temperature")

## Data outside >35 are outliers for Bhubhenshwar max (comment fixed: was "min")
hist(x = New_Weather_Bhubhneshwar$prcp, main = "Bhubhenshwar Precipitation", breaks = 5)

## Extreme precipitation cases are above 50
## So lets make the special (outlier) dataset.
Special_Weather_Bhubhenshwar <- New_Weather_Bhubhneshwar %>%
  filter((tavg < 24) | (tavg > 32) | (tmin < 15) | (tmax > 35) | (prcp > 50))
## Precipitation makes the most impact on flights, so look at how precipitation
## relates to temperature.
## FIX: map prcp to point size inside aes() instead of passing the column via
## `$` outside the ggplot data pipeline.
ggplot(Special_Weather_Bhubhenshwar, aes(x = tmin, y = tmax, color = prcp)) +
  geom_point(aes(size = prcp / 50)) +
  labs(title = "Impact of temperature on precipitation")

## Extreme precipitation happens when tmin is between 17 and 27,
## so zoom into that band and redraw.
Ext_Special_Weather_Bhubhenshwar <- Special_Weather_Bhubhenshwar %>%
  filter((tmin > 17) & (tmin < 27))
ggplot(Ext_Special_Weather_Bhubhenshwar, aes(x = tmin, y = tmax, color = prcp)) +
  geom_point(aes(size = prcp / 75)) +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of Rourkela Weather Dataset
## Distribution checks for Rourkela daily weather.
## (Plot-title typo "Temparature" corrected.)
hist(x = New_Weather_Rourkela$tavg, main = "Rourkela Average Temperature")

## Data outside <20 and >32 are outliers for Rourkela average
hist(x = New_Weather_Rourkela$tmin, main = "Rourkela Min Temperature")

## Data outside <15 are outliers for Rourkela min
hist(x = New_Weather_Rourkela$tmax, main = "Rourkela Max Temperature")

## Data outside >35 are outliers for Rourkela max (comment fixed: was "min")
hist(x = New_Weather_Rourkela$prcp, main = "Rourkela Precipitation", breaks = 5)

## Extreme precipitation cases are above 40
## So lets make the special (outlier) dataset.
## NOTE(review): tmax > 30 below disagrees with the >35 bound stated above --
## confirm which threshold is intended.
Special_Weather_Rourkela <- New_Weather_Rourkela %>%
  filter((tavg < 20) | (tavg > 32) | (tmin < 15) | (tmax > 30) | (prcp > 40))
## Precipitation makes the most impact on flights, so look at how precipitation
## relates to temperature.
## FIX: map prcp to point size inside aes() instead of passing the column via
## `$` outside the ggplot data pipeline.
ggplot(Special_Weather_Rourkela, aes(x = tmin, y = tmax, color = prcp)) +
  geom_point(aes(size = prcp / 50)) +
  labs(title = "Impact of temperature on precipitation")

## Extreme precipitation happens when tmin is between 22 and 27,
## so zoom into that band and redraw.
Ext_Special_Weather_Rourkela <- Special_Weather_Rourkela %>%
  filter((tmin > 22) & (tmin < 27))
ggplot(Ext_Special_Weather_Rourkela, aes(x = tmin, y = tmax, color = prcp)) +
  geom_point(aes(size = prcp / 75)) +
  labs(title = "Impact of temperature on precipitation")

Exploratory Analysis of AQI data station wise
# Peek at the cleaned station-hour AQI data (one row per station per hour).
head(New_AQ_station_hour)
## StationId Datetime PM2.5 PM10 NO NO2 NOx NH3 CO SO2
## 17 AP001 2017-11-25 09:00:00 104.00 148.50 1.93 23.00 13.75 9.80 0.1 15.30
## 18 AP001 2017-11-25 10:00:00 94.50 142.00 1.33 16.25 9.75 9.65 0.1 17.00
## 19 AP001 2017-11-25 11:00:00 82.75 126.50 1.47 14.83 9.07 9.70 0.1 15.40
## 22 AP001 2017-11-25 14:00:00 68.50 117.00 1.35 13.60 8.35 7.40 0.1 21.80
## 23 AP001 2017-11-25 15:00:00 69.25 112.25 1.52 11.80 7.55 9.25 0.1 21.38
## 24 AP001 2017-11-25 16:00:00 70.00 107.00 2.80 30.33 18.40 6.15 0.1 18.90
## O3 Benzene Toluene Xylene AQI AQI_Bucket
## 17 117.62 0.30 10.40 0.23 155 Moderate
## 18 136.23 0.28 7.10 0.15 159 Moderate
## 19 149.92 0.20 4.55 0.08 173 Moderate
## 22 161.70 0.10 2.30 0.00 191 Moderate
## 23 161.68 0.10 2.35 0.00 191 Moderate
## 24 147.97 0.10 3.70 0.00 191 Moderate
# Lets see the performance of the AQI over years
# Split Datetime ("YYYY-MM-DD HH:MM:SS") into Date plus Hr/Min/Sec strings and
# keep a numeric Hour column for the day-part binning below.
AQ_station_Day_Sep <- New_AQ_station_hour %>% separate(Datetime, c('Date', 'Time'), sep =" ") %>% separate(Time, c('Hr', 'Min', 'Sec'), sep=":") %>% mutate(Hour = as.numeric(Hr))
# Bin the hour into day-parts: (-1,6] = Early_Morning, (6,18] = Day, (18,24] = Night.
AQ_station_Day_Duration <- AQ_station_Day_Sep %>% mutate(Duration=cut(Hour, breaks=c(-1, 6, 18, 24),labels=c("Early_Morning","Day","Night")))
# Mean AQI per calendar year within each AQI bucket.
AQI_Over_Years <- AQ_station_Day_Duration%>% group_by(YEAR = year(ymd(Date)), AQI_Bucket) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
ggplot(AQI_Over_Years, aes(x = YEAR, y = Mean_AQI, color=AQI_Bucket))+ geom_line()

## It appears that 'Severe' and 'Poor' cases didnt exist too predominantly until 2017 from which these
## two gained at the behest of 'Good' AQI cases
# Lets see the performance of the AQI over a day in every year:
# mean AQI per year, split by time-of-day band.
AQI_Over_Time <- AQ_station_Day_Duration %>% group_by(YEAR = year(ymd(Date)), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
## FIX: `Duration` is a factor, so `size = AQI_Over_Time$Duration` raised
## "'*' not meaningful for factors" and produced no usable size mapping.
## Encode the day-part as a mapped shape inside aes() instead.
ggplot(AQI_Over_Time, aes(x = YEAR, y = Mean_AQI, color = Duration)) + geom_point(aes(shape = Duration))

## We can see that 2017 had the worst air quality index but worst was during day time
## Things slowed down in the years later but in them, night time pollution was high.
## In all cases, early morning pollution was the lowest.
# Lets see the performance of the AQI monthwise:
# mean AQI per calendar month, split by time-of-day band.
AQI_monthwise <- AQ_station_Day_Duration %>% group_by(Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'Month'. You can override using the
## `.groups` argument.
## FIX: `Duration` is a factor and cannot be a literal point size (the original
## raised "'*' not meaningful for factors"); map it to shape inside aes().
ggplot(AQI_monthwise, aes(x = Month, y = Mean_AQI, color = Duration)) + geom_point(aes(shape = Duration))

## We can see that the colder months - i.e., from Oct to Feb, the AQI is the worst, its bad during summer but it appears the best in monsoon season.
# Mean AQI per year/month combination, split by time-of-day band.
AQI_Over_month <- AQ_station_Day_Duration %>% group_by(YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR', 'Month'. You can override using the
## `.groups` argument.
# Lets see how this works out yearwise and monthwise
## FIX: the original passed the factor column as a literal point size, raising
## "'*' not meaningful for factors" once per facet; map the day-part to shape
## inside aes() instead.
ggplot(AQI_Over_month, aes(x = Month, y = Mean_AQI, color = Duration)) + geom_point(aes(shape = Duration)) + facet_wrap(~YEAR)

## We can see the same trend every year - i.e., the colder months has the worst AQI while the monsoon has the best AQI while summer/spring time having the intermediate values
## Now lets report this city wise - probably for the Month wise combination
# Mean AQI per station / year / month / day-part.
AQI_Stationwise <- AQ_station_Day_Duration %>% group_by(Station = StationId, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'Station', 'YEAR', 'Month'. You can
## override using the `.groups` argument.
## FIX: map Duration to shape inside aes() -- the original passed the factor
## column via `$` outside aes(), bypassing ggplot2's data pipeline.
ggplot(AQI_Stationwise, aes(x = Month, y = Mean_AQI, color = Station)) + geom_point(aes(shape = Duration))

## Across stations, the trend seems to be the same - i.e., worst during winter, intermediate during spring/summer, best during monsoon.
## Now out of the 19 stations, we are only interested in Delhi, for which we are going to do the air traffic impact analysis - so lets filter those stations and zoom into their performance alone
## FIX: use `%in%` instead of chained equality tests for the two Delhi station
## ids, and map Duration to shape inside aes() -- a factor cannot be a literal
## point size (the original raised "'*' not meaningful for factors").
AQI_Delhi_Station <- AQ_station_Day_Duration %>% filter(StationId %in% c("DL001", "DL019")) %>% group_by(Station = StationId, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'Station', 'YEAR', 'Month'. You can
## override using the `.groups` argument.
ggplot(AQI_Delhi_Station, aes(x = Month, y = Mean_AQI, color = Duration)) + geom_point(aes(shape = Duration)) + facet_wrap(~YEAR)

# Lets see how AQ day data is different from station hour wise data
# Mean AQI per year per bucket, computed from the daily (not hourly) station data.
New_AQ_station_day_Years <- New_AQ_station_day%>% group_by(YEAR = year(ymd(Date)), AQI_Bucket) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
# Re-inspect the hourly data with the derived Hour/Duration columns attached.
head(AQ_station_Day_Duration)
## StationId Date Hr Min Sec PM2.5 PM10 NO NO2 NOx NH3 CO
## 17 AP001 2017-11-25 09 00 00 104.00 148.50 1.93 23.00 13.75 9.80 0.1
## 18 AP001 2017-11-25 10 00 00 94.50 142.00 1.33 16.25 9.75 9.65 0.1
## 19 AP001 2017-11-25 11 00 00 82.75 126.50 1.47 14.83 9.07 9.70 0.1
## 22 AP001 2017-11-25 14 00 00 68.50 117.00 1.35 13.60 8.35 7.40 0.1
## 23 AP001 2017-11-25 15 00 00 69.25 112.25 1.52 11.80 7.55 9.25 0.1
## 24 AP001 2017-11-25 16 00 00 70.00 107.00 2.80 30.33 18.40 6.15 0.1
## SO2 O3 Benzene Toluene Xylene AQI AQI_Bucket Hour Duration
## 17 15.30 117.62 0.30 10.40 0.23 155 Moderate 9 Day
## 18 17.00 136.23 0.28 7.10 0.15 159 Moderate 10 Day
## 19 15.40 149.92 0.20 4.55 0.08 173 Moderate 11 Day
## 22 21.80 161.70 0.10 2.30 0.00 191 Moderate 14 Day
## 23 21.38 161.68 0.10 2.35 0.00 191 Moderate 15 Day
## 24 18.90 147.97 0.10 3.70 0.00 191 Moderate 16 Day
## There seems to be nothing new that we can derive out of the station day wise that we can't derive out of
## station hour wise data. so no further analysis needed over here
Exploratory Analysis of AQI data city wise
## Lets look at City wise hourly AQI data
head(New_AQ_city_hour)
## City Datetime PM2.5 PM10 NO NO2 NOx NH3 CO
## 50889 Amaravati 2017-11-25 09:00:00 104.00 148.50 1.93 23.00 13.75 9.80 0.1
## 50890 Amaravati 2017-11-25 10:00:00 94.50 142.00 1.33 16.25 9.75 9.65 0.1
## 50891 Amaravati 2017-11-25 11:00:00 82.75 126.50 1.47 14.83 9.07 9.70 0.1
## 50894 Amaravati 2017-11-25 14:00:00 68.50 117.00 1.35 13.60 8.35 7.40 0.1
## 50895 Amaravati 2017-11-25 15:00:00 69.25 112.25 1.52 11.80 7.55 9.25 0.1
## 50896 Amaravati 2017-11-25 16:00:00 70.00 107.00 2.80 30.33 18.40 6.15 0.1
## SO2 O3 Benzene Toluene Xylene AQI AQI_Bucket
## 50889 15.30 117.62 0.30 10.40 0.23 155 Moderate
## 50890 17.00 136.23 0.28 7.10 0.15 159 Moderate
## 50891 15.40 149.92 0.20 4.55 0.08 173 Moderate
## 50894 21.80 161.70 0.10 2.30 0.00 191 Moderate
## 50895 21.38 161.68 0.10 2.35 0.00 191 Moderate
## 50896 18.90 147.97 0.10 3.70 0.00 191 Moderate
# Lets see the performance of the AQI over years
# Same Datetime split + day-part binning as for the station-level data above.
AQ_city_Day_Sep <- New_AQ_city_hour %>% separate(Datetime, c('Date', 'Time'), sep =" ") %>% separate(Time, c('Hr', 'Min', 'Sec'), sep=":") %>% mutate(Hour = as.numeric(Hr))
AQ_city_Day_Duration <- AQ_city_Day_Sep %>% mutate(Duration=cut(Hour, breaks=c(-1, 6, 18, 24),labels=c("Early_Morning","Day","Night")))
##
# Mean AQI per calendar year within each AQI bucket (city-level data).
AQI_City_Over_Years <- AQ_city_Day_Duration%>% group_by(YEAR = year(ymd(Date)), AQI_Bucket) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
AQI_City_Over_Years
## # A tibble: 35 × 3
## # Groups: YEAR [6]
## YEAR AQI_Bucket Mean_AQI
## <dbl> <chr> <dbl>
## 1 2015 Moderate 152.
## 2 2015 Poor 249.
## 3 2015 Satisfactory 82.0
## 4 2015 Severe 442.
## 5 2015 Very Poor 348.
## 6 2016 Good 40.5
## 7 2016 Moderate 125.
## 8 2016 Poor 229.
## 9 2016 Satisfactory 82.8
## 10 2016 Severe 455.
## # ℹ 25 more rows
ggplot(AQI_City_Over_Years, aes(x = YEAR, y = Mean_AQI, color=AQI_Bucket))+ geom_line()

## We can see that the colder months - i.e., from Oct to Feb, the AQI is the worst, its bad during summer but it appears the best in monsoon season.
# Lets see the performance of the AQI over a day in every year:
# mean AQI per year by time-of-day band (city-level data).
AQI_City_Over_Time <- AQ_city_Day_Duration %>% group_by(YEAR = year(ymd(Date)), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
## FIX: `Duration` is a factor and cannot be a literal point size (the original
## raised "'*' not meaningful for factors"); map it to shape inside aes().
ggplot(AQI_City_Over_Time, aes(x = YEAR, y = Mean_AQI, color = Duration)) + geom_point(aes(shape = Duration))

## It appears 2015 had peak values of AQIs, which dropped to very low in 2016, gained to half the levels back in 2017 and then gradually reducing
## We can see that 2015-2017 worst was during day time but from 2018, there were worse night times - may be something to do with dropped levels of AQIs as well
## In all cases, early morning pollution seems to be the lowest.
# Lets see the performance of the AQI monthwise:
# mean AQI per calendar month by time-of-day band (city-level data).
AQI_City_monthwise <- AQ_city_Day_Duration %>% group_by(Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'Month'. You can override using the
## `.groups` argument.
## FIX: `Duration` is a factor and cannot be a literal point size (the original
## raised "'*' not meaningful for factors"); map it to shape inside aes().
ggplot(AQI_City_monthwise, aes(x = Month, y = Mean_AQI, color = Duration)) + geom_point(aes(shape = Duration))

## We can see that the winter months - i.e., from Oct to Feb, the AQI is the worst, its bad during summer but it appears the best in monsoon season. The difference between stationwise data is that, here Nov seems to be the worst month while in ther other dataset, Dec held the worst...
# Mean AQI per year/month combination by time-of-day band (city-level data).
AQI_City_Over_month <- AQ_city_Day_Duration %>% group_by(YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE), Duration) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'YEAR', 'Month'. You can override using the
## `.groups` argument.
AQI_City_Over_month
## # A tibble: 185 × 4
## # Groups: YEAR, Month [63]
## YEAR Month Duration Mean_AQI
## <dbl> <ord> <fct> <dbl>
## 1 2015 Jan Early_Morning 343.
## 2 2015 Jan Day 341.
## 3 2015 Jan Night 341.
## 4 2015 Feb Early_Morning 329.
## 5 2015 Feb Day 329.
## 6 2015 Feb Night 325.
## 7 2015 Mar Early_Morning 249.
## 8 2015 Mar Day 262.
## 9 2015 Mar Night 254.
## 10 2015 Apr Early_Morning 304.
## # ℹ 175 more rows
# Lets see how this works out yearwise and monthwise
## FIX: the original passed the factor column as a literal point size, raising
## "'*' not meaningful for factors" once per facet; map the day-part to shape
## inside aes() instead.
ggplot(AQI_City_Over_month, aes(x = Month, y = Mean_AQI, color = Duration)) + geom_point(aes(shape = Duration)) + facet_wrap(~YEAR)

## We can see the same trend every year - i.e., the winter months has the worst AQI while the monsoon has the best AQI while summer/spring time having the intermediate values
## Now lets report this city wise - probably for the Month wise combination
AQI_Citywise <- AQ_city_Day_Sep %>% group_by(City, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE)) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'City', 'YEAR'. You can override using the
## `.groups` argument.
ggplot(AQI_Citywise, aes(x = Month, y = Mean_AQI))+ geom_point(aes(color=City))

## Across stations, the trend seems to be the same - i.e., worst during winter, intermediate during spring/summer, best during monsoon.
## Now out of all the cities, we are only interested in Delhi, for which we are going to do the air traffic impact analysis - so lets filter it and zoom into its performance alone
AQI_Delhi_City <- AQ_city_Day_Sep %>% filter( City == "Delhi") %>% group_by(City, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE)) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'City', 'YEAR'. You can override using the
## `.groups` argument.
ggplot(AQI_Delhi_City, aes(x = Month, y = Mean_AQI))+ geom_point() + facet_wrap(~YEAR)

# Cross-check: the same per-city/year/month means computed from the DAILY
# city data instead of the hourly data.
New_AQ_city_day_Years <- New_AQ_city_day%>% group_by(City, YEAR = year(ymd(Date)), Month = month(ymd(Date), label = TRUE)) %>% summarize(Mean_AQI = mean(AQI))
## `summarise()` has grouped output by 'City', 'YEAR'. You can override using the
## `.groups` argument.
head(New_AQ_city_day_Years)
## # A tibble: 6 × 4
## # Groups: City, YEAR [2]
## City YEAR Month Mean_AQI
## <chr> <dbl> <ord> <dbl>
## 1 Amaravati 2017 Nov 184.
## 2 Amaravati 2017 Dec 194.
## 3 Amaravati 2018 Jan 172.
## 4 Amaravati 2018 Feb 107.
## 5 Amaravati 2018 Mar 84.6
## 6 Amaravati 2018 Apr 63.8
ggplot(New_AQ_city_day_Years, aes(x = Month, y = Mean_AQI))+ geom_point(aes(color=City))

## There seems to be small difference when comparing hour wise data to day wise data, but not significant enough. So we will live with the hour wise data itself for cities.
Analyse the parameters impacting the AQI index
## We would like to understand which parameters are really affecting the AQI value.
## Based on the analysis above we will stick to using the cleaned station hour wise datasets.
# Split Datetime and add a numeric Month column for the winter-month filter below.
New_AQ_station_hour_sep <- New_AQ_station_hour %>% separate(Datetime, c('Date', 'Time'), sep =" ") %>% separate(Time, c('Hr', 'Min', 'Sec'), sep=":") %>% mutate(Hour = as.numeric(Hr), Month = month(ymd(Date)))
## Now lets focus on the months where we have the most troubles with AQI - Oct to Feb
## ("_BM" = bad months subset, used throughout the model comparisons below).
New_AQ_station_hour_sep_BM <- New_AQ_station_hour_sep %>% filter ((Month == 1) | (Month == 2) | (Month == 10) | (Month == 11) | (Month == 12))
# Simple linear model of AQI on O3, full year.
# NOTE(review): fmodel() appears to come from the statisticalModeling
# package -- confirm it is loaded earlier in the document.
AQI_O3_model <- lm(AQI~O3, data = New_AQ_station_hour_sep)
fmodel(AQI_O3_model)

## OK wow, looks like AQI has a direct relationship with the O3 content
AQI_O3_model_BM <- lm(AQI~O3, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_O3_model_BM)

## In bad months looks like O3 and AQI are inversely proportional
## Lets try with PM2.5
# For each pollutant below: one model over the full year, one over the
# winter "bad months" (_BM) subset, each visualised with fmodel().
AQI_PM_2_5_model <- lm(AQI~PM2.5, data = New_AQ_station_hour_sep)
fmodel(AQI_PM_2_5_model)

## OK even here there is an impact - actually much more
AQI_PM_2_5_model_BM <- lm(AQI~PM2.5, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_PM_2_5_model_BM)

## PM2.5 impact seems to be much higher over the winter months
##Lets try others
AQI_PM_10_model <- lm(AQI~PM10, data = New_AQ_station_hour_sep)
fmodel(AQI_PM_10_model)

AQI_PM_10_model_BM <- lm(AQI~PM10, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_PM_10_model_BM)

## No significant impact change in winter months for PM10
AQI_NO_model <- lm(AQI~NO, data = New_AQ_station_hour_sep)
fmodel(AQI_NO_model)

AQI_NO_model_BM <- lm(AQI~NO, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_NO_model_BM)

## Slight reduction in winter months for NO
AQI_NO2_model <- lm(AQI~NO2, data = New_AQ_station_hour_sep)
fmodel(AQI_NO2_model)

AQI_NO2_model_BM <- lm(AQI~NO2, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_NO2_model_BM)

## No significant impact change in winter months for NO2
AQI_NOx_model <- lm(AQI~NOx, data = New_AQ_station_hour_sep)
fmodel(AQI_NOx_model)

AQI_NOx_model_BM <- lm(AQI~NOx, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_NOx_model_BM)

## Slight reduction in winter months for NOx
AQI_NH3_model <- lm(AQI~NH3, data = New_AQ_station_hour_sep)
fmodel(AQI_NH3_model)

AQI_NH3_model_BM <- lm(AQI~NH3, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_NH3_model_BM)

## NH3 impact seems to be much higher (50% more) over the winter months
## (comment fixed: it previously said "PM2.5", a copy-paste slip -- this pair
## of models is for NH3)
AQI_CO_model <- lm(AQI~CO, data = New_AQ_station_hour_sep)
fmodel(AQI_CO_model)

AQI_CO_model_BM <- lm(AQI~CO, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_CO_model_BM)

## No significant impact change in winter months for CO
AQI_SO2_model <- lm(AQI~SO2, data = New_AQ_station_hour_sep)
fmodel(AQI_SO2_model)

AQI_SO2_model_BM <- lm(AQI~SO2, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_SO2_model_BM)

## Slight reduction in winter months for SO2
# Benzene vs AQI, full year.
AQI_Benzene_model <- lm(AQI~Benzene, data = New_AQ_station_hour_sep)
fmodel(AQI_Benzene_model)

## BUG FIX: the winter-months fit previously overwrote AQI_Benzene_model.
## Name it AQI_Benzene_model_BM, consistent with every other pollutant pair,
## so both the full-year and winter models remain available.
AQI_Benzene_model_BM <- lm(AQI~Benzene, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_Benzene_model_BM)

## Slight reduction in winter months for Benzene
AQI_Toluene_model <- lm(AQI~Toluene, data = New_AQ_station_hour_sep)
fmodel(AQI_Toluene_model)

AQI_Toluene_model_BM <- lm(AQI~Toluene, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_Toluene_model_BM)

## Slight reduction in winter months for Toluene
AQI_Xylene_model <- lm(AQI~Xylene, data = New_AQ_station_hour_sep)
fmodel(AQI_Xylene_model)

AQI_Xylene_model_BM <- lm(AQI~Xylene, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_Xylene_model_BM)

## No significant impact change in winter months for Xylene
## Among these, the highest impact seems to be from PM2.5 and CO. Bringing in
## O3 due to their peculiar reversal in Winter months
# Combined model with the three selected predictors, full year and winter.
AQI_High_Impact_model <- lm(AQI~PM2.5+O3+CO, data = New_AQ_station_hour_sep)
fmodel(AQI_High_Impact_model)

AQI_High_Impact_model_BM <- lm(AQI~PM2.5+O3+CO, data = New_AQ_station_hour_sep_BM)
fmodel(AQI_High_Impact_model_BM)

Analyse cohesive dataset a bit to understand how delay and other parameters plot each other
# Columns available in the merged weather + AQI + traffic-delay dataset.
names(Delhi_cohesive_dataset)
## [1] "Date" "tavg" "tmin" "tmax" "prcp"
## [6] "StationId" "PM2.5" "PM10" "NO" "NO2"
## [11] "NOx" "NH3" "CO" "SO2" "O3"
## [16] "Benzene" "Toluene" "Xylene" "AQI" "AQI_Bucket"
## [21] "Daily_Delay" "Month"
# Delay vs AQI, coloured by bucket, sized by precipitation.
ggplot(Delhi_cohesive_dataset, aes(x = AQI, y = Daily_Delay, color = AQI_Bucket, size = prcp)) +
geom_point() +
labs(title = "Impact of AQI and prcp")

## As per the plot, Good AQI too gets observed for some delay cases but they are far and few... and does not seem to have caused a high amount of delays
## There are a huge amount of delays caused for satisfactory AQI cases but most of the delays could be associated
## with pretty high precipitation
## There are a good amount of delays associated with moderate cases too and they do have caused significant delays when combined with high precipitations
## Delay instances reduce for Poor AQI cases but there is a slight increase in the values of delays
## For very poor cases, impact gets high when combined with precipitation
## Severe cases are high impact ones but looks like not affected by precipitation
## Now lets view this purely from the weather perspective
names(Delhi_cohesive_dataset)
## [1] "Date" "tavg" "tmin" "tmax" "prcp"
## [6] "StationId" "PM2.5" "PM10" "NO" "NO2"
## [11] "NOx" "NH3" "CO" "SO2" "O3"
## [16] "Benzene" "Toluene" "Xylene" "AQI" "AQI_Bucket"
## [21] "Daily_Delay" "Month"
ggplot(Delhi_cohesive_dataset, aes(x = tavg, y = Daily_Delay, color = tmin, size = prcp)) +
geom_point() +
labs(title = "Impact of temp and prcp")

## Its clear that bigger precipitation brings in more instances of delays
## But its also interesting to find that higher tavg, higher precipitation and higher tmin bring
# in a lot of delays - though size of precipitation does not always result in costly delays
## Ok lets also analyse if the components O3, PM2.5 and CO have impacts on delays
ggplot(Delhi_cohesive_dataset, aes(x = O3, y = Daily_Delay, size = O3)) +
geom_point() +
labs(title = "Impact of O3")

## Looks like more O3 directly relates to higher delays
ggplot(Delhi_cohesive_dataset, aes(x = PM2.5, y = Daily_Delay, size = PM2.5)) +
geom_point() +
labs(title = "Impact of PM2.5")

## Looks like more PM2.5 might not have too much impact...
ggplot(Delhi_cohesive_dataset, aes(x = CO, y = Daily_Delay, size = CO)) +
geom_point() +
labs(title = "Impact of CO")

## Looks like size of CO has some correlation but may not be linear...
ggplot(Delhi_cohesive_dataset, aes(x = PM10, y = Daily_Delay, size = PM10)) +
geom_point() +
labs(title = "Impact of PM10")

## Looks like more PM10 might not have too much impact... (comment fixed: was "PM2.5")
ggplot(Delhi_cohesive_dataset, aes(x = prcp, y = Daily_Delay, size = prcp)) +
geom_point() +
labs(title = "Impact of rain")

## Looks like amount of rain has direct impact on delays...
ggplot(Delhi_cohesive_dataset, aes(x = tavg, y = Daily_Delay, size = tavg)) +
geom_point() +
labs(title = "Impact of Average Temp")

## Looks like a lot of low intensity delays on higher average temperature...
ggplot(Delhi_cohesive_dataset, aes(x = tmin, y = Daily_Delay, size = tmin)) +
geom_point() +
labs(title = "Impact of Tmin")

## Looks like a lot of low intensity delays on higher tmin... (comment fixed:
## it previously repeated "average temperature" for this tmin plot)
ggplot(Delhi_cohesive_dataset, aes(x = AQI, y = Daily_Delay, size = AQI, color=AQI_Bucket)) +
geom_point() +
labs(title = "Impact of AQI")

## Looks like a lot of low intensity delays on higher AQI values... (comment
## fixed: it previously said "higher Tmin" for this AQI plot)
## Lets see if the months itself have any impact on the delay
ggplot(Delhi_cohesive_dataset, aes(x = Month, y = Daily_Delay, size = Daily_Delay)) +
geom_point()+ scale_x_continuous(breaks=seq(1, 12, by = 1))+
labs(title = "Impact of Month")

## Looks like there is a high frequency of delays during monsoon and heavy delay during peak winter season
## Ok based on this, lets pick these elements to find the right model on what impacts the delays of Delhi airtraffic:
## Precipitation, AQI, tmin, O3 and CO
## Lets see how the elements individually have linear regression relationship with the traffic delay
## Ok lets build the base model here
## Candidate models for daily traffic delay. Each adds one predictor to the
## AQI-only base so the marginal contribution can be compared below.
## FIX: use `<-` for top-level assignment instead of `=`, per R style.
Delhi_Traffic_Delay_Model_AQI <- lm(Daily_Delay ~ AQI, data = Delhi_cohesive_dataset)
fmodel(Delhi_Traffic_Delay_Model_AQI)

Delhi_Traffic_Delay_Model_tavg <- lm(Daily_Delay ~ AQI+tavg, data = Delhi_cohesive_dataset)
fmodel(Delhi_Traffic_Delay_Model_tavg)

Delhi_Traffic_Delay_Model_prcp <- lm(Daily_Delay ~ AQI+prcp, data = Delhi_cohesive_dataset)
fmodel(Delhi_Traffic_Delay_Model_prcp)

Delhi_Traffic_Delay_Model_O3 <- lm(Daily_Delay ~ AQI+O3, data = Delhi_cohesive_dataset)
fmodel(Delhi_Traffic_Delay_Model_O3)

Delhi_Traffic_Delay_Model_CO <- lm(Daily_Delay ~ AQI+CO, data = Delhi_cohesive_dataset)
fmodel(Delhi_Traffic_Delay_Model_CO)

Delhi_Traffic_Delay_Model_Month <- lm(Daily_Delay ~ AQI+Month, data = Delhi_cohesive_dataset)
fmodel(Delhi_Traffic_Delay_Model_Month)

# Evaluate each model at AQI = 0/200/400 (holding the second predictor at a
# representative value) to compare predicted delays.
evaluate_model(Delhi_Traffic_Delay_Model_AQI)
## AQI model_output
## 1 0 65.89103
## 2 200 75.90179
## 3 400 85.91255
evaluate_model(Delhi_Traffic_Delay_Model_tavg, tavg = 35)
## AQI tavg model_output
## 1 0 35 65.28536
## 2 200 35 74.13083
## 3 400 35 82.97629
evaluate_model(Delhi_Traffic_Delay_Model_prcp, prcp = 150)
## AQI prcp model_output
## 1 0 150 207.8647
## 2 200 150 221.3815
## 3 400 150 234.8983
evaluate_model(Delhi_Traffic_Delay_Model_O3, O3 = 50)
## AQI O3 model_output
## 1 0 50 75.87323
## 2 200 50 86.52229
## 3 400 50 97.17135
evaluate_model(Delhi_Traffic_Delay_Model_CO, CO = 1)
## AQI CO model_output
## 1 0 1 66.60121
## 2 200 1 76.02051
## 3 400 1 85.43981
evaluate_model(Delhi_Traffic_Delay_Model_Month, Month = 12)
## AQI Month model_output
## 1 0 12 84.65495
## 2 200 12 100.87275
## 3 400 12 117.09055
# Each diff_* is the predicted-delay change as AQI goes from 0 to 200 under
# the corresponding model (values hard-coded from the outputs above).
diff_1 <- 75.90179 - 65.89103
diff_1
## [1] 10.01076
diff_2 <- 74.13083 - 65.28536
diff_2
## [1] 8.84547
diff_3 <- 221.3815 - 207.8647
diff_3
## [1] 13.5168
diff_4 <- 86.52229 - 75.87323
diff_4
## [1] 10.64906
diff_5 <- 76.02051 - 66.60121
diff_5
## [1] 9.4193
diff_6 <- 100.87275 - 84.65495
diff_6
## [1] 16.2178
# Comparing the model evaluation based on above, we can see that prcp, Month and O3 have a good impact
# on the delay
## To evaluate the base model, split the data into test and train datasets
#make this split reproducible
set.seed(1)
#Use 70% of dataset as training set and remaining 30% as testing set
sample_set <- sample(c(TRUE, FALSE), nrow(Delhi_cohesive_dataset), replace=TRUE, prob=c(0.7,0.3))
train_dataset <- Delhi_cohesive_dataset[sample_set, ]
test_dataset <- Delhi_cohesive_dataset[!sample_set, ]
# the base model with AQI and prcp (comment fixed: it previously said
# "AQI and tavg", which does not match the formula below)
Base_Model_Delay = lm(Daily_Delay ~ AQI+prcp, data = train_dataset)
# the augmented model with Month added as well (comment fixed: it previously
# said "precipitation", but prcp is already in the base model)
Aug_Model_Delay = lm(Daily_Delay ~ AQI+prcp+Month, data = train_dataset)
# Run cross validation trials on the two models
trials <- cv_pred_error(Base_Model_Delay, Aug_Model_Delay)
# Compare the two sets of cross-validated errors (MSE) with a Welch t-test
t.test(mse ~ model, data = trials)
##
## Welch Two Sample t-test
##
## data: mse by model
## t = -2.8426, df = 5.7322, p-value = 0.03099
## alternative hypothesis: true difference in means between group Aug_Model_Delay and group Base_Model_Delay is not equal to 0
## 95 percent confidence interval:
## -185.63143 -12.83777
## sample estimates:
## mean in group Aug_Model_Delay mean in group Base_Model_Delay
## 3091.307 3190.542
# t-statistic is -2.8426. degrees of freedom, df is 5.7322 are the degrees of freedom. These are used with a t-distribution to derive p-value of 0.03099
# p-value = 0.03099 - i.e., Given that there is no actual/true difference in means, if we repeat the experiment over and over again, 3.1% of the time we would see the type of difference in means as in your samples, or a more extreme difference in means. Since p value is significantly lower than 0.05, the differences are significant.
# So we can reject the null hypothesis (H0) of no difference between the (true) averages of the two groups
#alternative hypothesis: true difference in means is not equal to 0
#95 percent confidence interval:
# -185.63143 -12.83777
# If H0 is false, the true difference in means likely lies in the interval [-185.63143, -12.83777]; the values 3091.307 and 3190.542 are the sample means of the two groups, not a confidence interval.
# So we will choose the augmented model - i.e., Daily_Delay ~ AQI+prcp+Month
## Model for predicting Delhi air traffic delays
## For our model to predict the air traffic delays:
## Response Variable is Daily_Delay
## Explanatory Variables are Precipitation (prcp), AQI and Month
## We choose a linear regression model because this is about predicting a
## numeric value, not a classification problem.
Delhi_Traffic_Delay_Model <- lm(Daily_Delay ~ AQI + prcp + Month, data = train_dataset)
summary(Delhi_Traffic_Delay_Model)
##
## Call:
## lm(formula = Daily_Delay ~ AQI + prcp + Month, data = train_dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -80.585 -30.554 -12.036 6.998 284.359
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.63157 11.80968 2.509 0.01296 *
## AQI 0.08275 0.03793 2.182 0.03038 *
## prcp 0.93401 0.41760 2.237 0.02651 *
## Month 3.92586 1.23860 3.170 0.00179 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 53.59 on 185 degrees of freedom
## Multiple R-squared: 0.08357, Adjusted R-squared: 0.06871
## F-statistic: 5.623 on 3 and 185 DF, p-value: 0.001038
# Predict delays on the held-out test set. The second argument of predict.lm
# is named explicitly (newdata) for clarity and safety against argument-order
# mistakes.
Predicted_Traffic_Delay <- predict(Delhi_Traffic_Delay_Model, newdata = test_dataset)
Predicted_Traffic_Delay
## 4 6 7 15 17 18 20 21
## 70.04663 70.48225 66.13254 86.71768 80.13123 128.87608 69.57199 105.29059
## 29 35 37 39 41 43 46 49
## 69.53240 93.94147 98.30356 110.55783 98.28110 95.03004 68.54983 61.05614
## 52 61 68 70 72 76 77 79
## 57.72089 65.94984 48.48926 66.38734 71.30725 59.62546 57.14291 62.60452
## 80 82 85 87 94 95 96 99
## 48.07795 50.80630 48.92676 57.18521 55.47693 54.07015 54.31841 49.84982
## 104 109 111 112 117 121 125 135
## 75.11322 54.87521 55.84448 70.11429 65.36219 66.46084 72.88393 66.38080
## 139 145 148 150 162 164 165 169
## 74.59925 76.01754 112.17120 71.53428 66.28567 63.65000 64.39476 85.73501
## 172 173 176 178 180 183 185 187
## 65.75533 69.72740 67.32761 69.67660 70.51966 79.32889 64.51405 72.55232
## 188 189 191 194 198 200 210 211
## 71.22830 108.22343 70.51966 66.08633 70.72042 69.75935 78.49948 72.44394
## 213 214 215 218 219 225 230 243
## 72.24649 69.59844 72.19569 72.49474 102.32417 68.63737 76.00685 94.49699
## 250 251 252 260 264 265
## 134.81339 90.11819 89.78719 53.16956 62.75636 61.51509
# Attach the predictions to the test set for per-month comparison
test_dataset["Predicted_Delay"] <- Predicted_Traffic_Delay
# Returning multiple rows per group from summarise() is deprecated since
# dplyr 1.1.0; reframe() is the documented replacement and returns an
# ungrouped data frame, so the previous deprecation warning no longer fires.
# (The variable name keeps its original spelling because later code refers
# to it.)
Summary_Model_Performace <- test_dataset %>%
  group_by(YEAR = year(ymd(Date)), Month) %>%
  reframe(Daily_Delay, Predicted_Delay)
Summary_Model_Performace
## # A tibble: 78 × 4
## YEAR Month Daily_Delay Predicted_Delay
## <dbl> <dbl> <dbl> <dbl>
## 1 2018 6 30 70.0
## 2 2018 7 31 70.5
## 3 2018 7 8 66.1
## 4 2018 7 59 86.7
## 5 2018 7 45 80.1
## 6 2018 7 62 129.
## 7 2018 7 100 69.6
## 8 2018 7 61 105.
## 9 2018 8 105 69.5
## 10 2018 11 60 93.9
## # ℹ 68 more rows
# Plot actual vs predicted daily delays by month, faceted by year.
# Legend label typo fixed: 'Predictede_Delay' -> 'Predicted_Delay'.
ggplot(Summary_Model_Performace, aes(x = Month)) +
  geom_point(aes(y = Daily_Delay, color = 'Daily_Delay')) +
  geom_point(aes(y = Predicted_Delay, color = 'Predicted_Delay')) +
  scale_x_continuous(breaks = seq(1, 12, by = 1)) +
  labs(title = "Model Performance") +
  facet_wrap(~YEAR)

# As we can see, the model performs somewhat acceptably for some months except
# for certain extreme cases of delays. So the model needs further fine tuning,
# or the dataset needs to be reanalyzed.